From 2029b037e9c83cfbb7a83f90c7a5a3bed93a7930 Mon Sep 17 00:00:00 2001
From: Rose
Date: Sat, 31 May 2025 16:09:26 -0400
Subject: [PATCH 1/2] [X86] mtune should be generic

Yes, a lot of tests were updated, but we cannot keep hiding bugs this way.

---
 llvm/lib/Target/X86/X86Subtarget.cpp | 2 +-
 .../CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll | 49 +-
 .../CodeGen/X86/2008-12-23-crazy-address.ll | 3 +-
 .../X86/2012-01-10-UndefExceptionEdge.ll | 45 +-
 .../CodeGen/X86/2012-12-1-merge-multiple.ll | 6 +-
 .../CodeGen/X86/64-bit-shift-by-32-minus-y.ll | 32 +-
 .../X86/AMX/amx-greedy-ra-spill-shape.ll | 2 +-
 .../test/CodeGen/X86/GlobalISel/add-scalar.ll | 6 +-
 .../test/CodeGen/X86/GlobalISel/sub-scalar.ll | 6 +-
 .../CodeGen/X86/MergeConsecutiveStores.ll | 226 +-
 .../X86/PR71178-register-coalescer-crash.ll | 2 +-
 llvm/test/CodeGen/X86/abds-neg.ll | 119 +-
 llvm/test/CodeGen/X86/abds.ll | 158 +-
 llvm/test/CodeGen/X86/abdu-neg.ll | 169 +-
 llvm/test/CodeGen/X86/abdu-vector-128.ll | 10 +-
 llvm/test/CodeGen/X86/abdu.ll | 200 +-
 llvm/test/CodeGen/X86/abi-isel.ll | 672 +-
 llvm/test/CodeGen/X86/abs.ll | 2 +-
 llvm/test/CodeGen/X86/add-cmov.ll | 9 +-
 llvm/test/CodeGen/X86/add-ext.ll | 21 +-
 llvm/test/CodeGen/X86/add-of-carry.ll | 4 +-
 llvm/test/CodeGen/X86/add-sub-bool.ll | 42 +-
 llvm/test/CodeGen/X86/add.ll | 16 +-
 llvm/test/CodeGen/X86/add_shl_constant.ll | 47 +-
 llvm/test/CodeGen/X86/addcarry.ll | 36 +-
 llvm/test/CodeGen/X86/addr-mode-matcher-2.ll | 6 +-
 llvm/test/CodeGen/X86/and-sink.ll | 14 +-
 llvm/test/CodeGen/X86/andnot-patterns.ll | 10 +-
 .../any_extend_vector_inreg_of_broadcast.ll | 943 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll | 368 +-
 .../apx/check-nf-in-suppress-reloc-pass.ll | 6 +-
 llvm/test/CodeGen/X86/apx/cmov.ll | 12 +-
 .../CodeGen/X86/apx/flags-copy-lowering.ll | 2 +-
 llvm/test/CodeGen/X86/apx/mul-i1024.ll | 2243 +-
 llvm/test/CodeGen/X86/arithmetic_fence2.ll | 28 +-
 llvm/test/CodeGen/X86/atomic-bit-test.ll | 2 +-
 llvm/test/CodeGen/X86/atomic-eflags-reuse.ll | 16 +-
 llvm/test/CodeGen/X86/atomic-fp.ll | 24 +-
 llvm/test/CodeGen/X86/atomic-mi.ll | 4 +-
 llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 104 +-
 llvm/test/CodeGen/X86/avg.ll | 97 +-
 llvm/test/CodeGen/X86/avgceils-scalar.ll | 30 +-
 llvm/test/CodeGen/X86/avgceils.ll | 12 +-
 llvm/test/CodeGen/X86/avgceilu-scalar.ll | 30 +-
 llvm/test/CodeGen/X86/avgfloors-scalar.ll | 4 +-
 llvm/test/CodeGen/X86/avgfloors.ll | 4 +-
 llvm/test/CodeGen/X86/avgflooru-i128.ll | 22 +-
 llvm/test/CodeGen/X86/avgflooru-scalar.ll | 8 +-
 llvm/test/CodeGen/X86/avoid-lea-scale2.ll | 17 +-
 llvm/test/CodeGen/X86/avx-basic.ll | 2 +-
 llvm/test/CodeGen/X86/avx-cvt-3.ll | 8 +-
 .../CodeGen/X86/avx-intrinsics-fast-isel.ll | 56 +-
 .../test/CodeGen/X86/avx-intrinsics-x86_64.ll | 36 +-
 llvm/test/CodeGen/X86/avx-logic.ll | 16 +-
 llvm/test/CodeGen/X86/avx-select.ll | 4 +-
 llvm/test/CodeGen/X86/avx-splat.ll | 4 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll | 8 +-
 llvm/test/CodeGen/X86/avx-vperm2x128.ll | 4 +-
 llvm/test/CodeGen/X86/avx2-arith.ll | 2 +-
 llvm/test/CodeGen/X86/avx2-conversions.ll | 6 +-
 llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll | 372 +-
 llvm/test/CodeGen/X86/avx2-nontemporal.ll | 18 +-
 llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 26 +-
 llvm/test/CodeGen/X86/avx2-vector-shifts.ll | 36 +-
 llvm/test/CodeGen/X86/avx512-arith.ll | 6 +-
 .../CodeGen/X86/avx512-broadcast-unfold.ll | 1082 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll | 366 +-
 .../avx512-extract-subvector-load-store.ll | 24 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll | 296 +-
 .../X86/avx512-intrinsics-fast-isel.ll | 40 +-
.../CodeGen/X86/avx512-intrinsics-upgrade.ll | 360 +- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 58 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 4 +- .../CodeGen/X86/avx512-masked-memop-64-32.ll | 12 +- llvm/test/CodeGen/X86/avx512-nontemporal.ll | 4 +- llvm/test/CodeGen/X86/avx512-regcall-Mask.ll | 2 +- .../test/CodeGen/X86/avx512-regcall-NoMask.ll | 30 +- llvm/test/CodeGen/X86/avx512-select.ll | 4 +- .../X86/avx512-shuffles/partial_permute.ll | 500 +- .../CodeGen/X86/avx512-shuffles/permute.ll | 320 +- llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 16 +- .../X86/avx512bf16-intrinsics-upgrade.ll | 4 +- .../X86/avx512bw-intrinsics-fast-isel.ll | 14 +- .../X86/avx512bw-intrinsics-upgrade.ll | 114 +- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 10 +- .../X86/avx512bwvl-intrinsics-upgrade.ll | 672 +- llvm/test/CodeGen/X86/avx512fp16-frem.ll | 408 +- .../X86/avx512vl-intrinsics-fast-isel.ll | 8 +- .../X86/avx512vl-intrinsics-upgrade.ll | 630 +- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 36 +- .../X86/avx512vlvp2intersect-intrinsics.ll | 4 +- .../X86/avx512vp2intersect-intrinsics.ll | 4 +- llvm/test/CodeGen/X86/bfloat-calling-conv.ll | 14 +- llvm/test/CodeGen/X86/bfloat.ll | 30 +- .../test/CodeGen/X86/bitcast-and-setcc-512.ll | 46 +- .../X86/bitcast-int-to-vector-bool-sext.ll | 32 +- .../X86/bitcast-int-to-vector-bool-zext.ll | 48 +- .../CodeGen/X86/bitcast-int-to-vector-bool.ll | 16 +- llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 2 +- llvm/test/CodeGen/X86/bitreverse.ll | 136 +- llvm/test/CodeGen/X86/bitselect.ll | 42 +- llvm/test/CodeGen/X86/bmi2.ll | 12 +- llvm/test/CodeGen/X86/bool-simplify.ll | 2 +- .../X86/broadcast-elm-cross-splat-vec.ll | 88 +- llvm/test/CodeGen/X86/bswap-wide-int.ll | 6 +- llvm/test/CodeGen/X86/bswap.ll | 30 +- llvm/test/CodeGen/X86/btc_bts_btr.ll | 118 +- llvm/test/CodeGen/X86/build-vector-512.ll | 162 +- .../CodeGen/X86/callbr-asm-blockplacement.ll | 9 +- llvm/test/CodeGen/X86/canonicalize-vars.ll | 4 +- llvm/test/CodeGen/X86/cast-vsel.ll | 52 +- llvm/test/CodeGen/X86/clear-highbits.ll | 102 +- llvm/test/CodeGen/X86/clear-lowbits.ll | 46 +- llvm/test/CodeGen/X86/clobber_frame_ptr2.ll | 50 +- llvm/test/CodeGen/X86/cmov-into-branch.ll | 4 +- llvm/test/CodeGen/X86/cmov.ll | 3 +- llvm/test/CodeGen/X86/cmovcmov.ll | 4 +- llvm/test/CodeGen/X86/cmp-concat.ll | 16 +- llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll | 18 +- llvm/test/CodeGen/X86/cmpf-avx.ll | 2 +- ...r-breaks-subreg-to-reg-liveness-reduced.ll | 2 +- .../X86/codegen-prepare-addrmode-tls.ll | 12 +- llvm/test/CodeGen/X86/combine-add.ll | 10 +- llvm/test/CodeGen/X86/combine-addo.ll | 2 +- llvm/test/CodeGen/X86/combine-and.ll | 2 +- llvm/test/CodeGen/X86/combine-bitselect.ll | 66 +- .../test/CodeGen/X86/combine-concatvectors.ll | 2 +- llvm/test/CodeGen/X86/combine-fcopysign.ll | 24 +- llvm/test/CodeGen/X86/combine-fneg.ll | 4 +- llvm/test/CodeGen/X86/combine-mul.ll | 8 +- llvm/test/CodeGen/X86/combine-pavg.ll | 4 +- llvm/test/CodeGen/X86/combine-pmadd.ll | 2 +- llvm/test/CodeGen/X86/combine-pmuldq.ll | 72 +- llvm/test/CodeGen/X86/combine-ptest-256.ll | 22 +- llvm/test/CodeGen/X86/combine-ptest.ll | 40 +- llvm/test/CodeGen/X86/combine-rotates.ll | 31 +- llvm/test/CodeGen/X86/combine-sbb.ll | 60 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 123 +- llvm/test/CodeGen/X86/combine-shl.ll | 31 +- llvm/test/CodeGen/X86/combine-smax.ll | 2 +- llvm/test/CodeGen/X86/combine-smin.ll | 2 +- llvm/test/CodeGen/X86/combine-sra.ll | 86 +- llvm/test/CodeGen/X86/combine-srem.ll | 8 +- llvm/test/CodeGen/X86/combine-srl.ll | 94 +- 
.../CodeGen/X86/combine-sse41-intrinsics.ll | 13 +- llvm/test/CodeGen/X86/combine-sub-usat.ll | 40 +- llvm/test/CodeGen/X86/combine-sub.ll | 4 +- llvm/test/CodeGen/X86/combine-udiv.ll | 98 +- .../X86/combine-undef-index-mscatter.ll | 10 +- llvm/test/CodeGen/X86/combine-urem.ll | 10 +- llvm/test/CodeGen/X86/comi-flags.ll | 168 +- llvm/test/CodeGen/X86/concat-cast.ll | 42 +- llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll | 6 +- llvm/test/CodeGen/X86/conditional-tailcall.ll | 154 +- llvm/test/CodeGen/X86/copy-eflags.ll | 4 +- .../copy-low-subvec-elt-to-high-subvec-elt.ll | 12 +- .../CodeGen/X86/critical-anti-dep-breaker.ll | 2 +- llvm/test/CodeGen/X86/ctlz.ll | 10 +- llvm/test/CodeGen/X86/dag-large-offset.ll | 3 +- .../CodeGen/X86/dag-update-nodetomatch.ll | 119 +- llvm/test/CodeGen/X86/dagcombine-cse.ll | 4 +- llvm/test/CodeGen/X86/dagcombine-select.ll | 2 +- llvm/test/CodeGen/X86/dagcombine-shifts.ll | 2 +- .../X86/div-rem-pair-recomposition-signed.ll | 725 +- .../div-rem-pair-recomposition-unsigned.ll | 578 +- llvm/test/CodeGen/X86/divide-by-constant.ll | 39 +- llvm/test/CodeGen/X86/divmod128.ll | 36 +- llvm/test/CodeGen/X86/divrem.ll | 26 + llvm/test/CodeGen/X86/dpbusd_i4.ll | 6 +- llvm/test/CodeGen/X86/early-ifcvt.ll | 114 +- llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll | 40 +- .../CodeGen/X86/expand-vp-int-intrinsics.ll | 198 +- llvm/test/CodeGen/X86/extract-bits.ll | 1442 +- llvm/test/CodeGen/X86/extract-concat.ll | 8 +- llvm/test/CodeGen/X86/extract-lowbits.ll | 98 +- llvm/test/CodeGen/X86/extract-store.ll | 28 +- llvm/test/CodeGen/X86/extractelement-fp.ll | 4 +- llvm/test/CodeGen/X86/extractelement-load.ll | 82 +- .../CodeGen/X86/fast-isel-select-cmov2.ll | 72 +- llvm/test/CodeGen/X86/fcmp-logic.ll | 40 +- llvm/test/CodeGen/X86/fixup-bw-inst.ll | 46 +- llvm/test/CodeGen/X86/fma-fneg-combine-2.ll | 4 +- .../CodeGen/X86/fma-intrinsics-fast-isel.ll | 4 +- llvm/test/CodeGen/X86/fma_patterns.ll | 8 +- llvm/test/CodeGen/X86/fma_patterns_wide.ll | 60 +- llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 138 +- .../CodeGen/X86/fminimumnum-fmaximumnum.ll | 561 +- llvm/test/CodeGen/X86/fold-add.ll | 12 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +- llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 29 +- llvm/test/CodeGen/X86/fold-tied-op.ll | 117 +- .../CodeGen/X86/fold-vector-sext-crash2.ll | 4 +- llvm/test/CodeGen/X86/fp-round.ll | 8 +- .../CodeGen/X86/fp-strict-scalar-cmp-fp16.ll | 392 - llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll | 288 +- llvm/test/CodeGen/X86/fp-strict-scalar.ll | 6 +- llvm/test/CodeGen/X86/fp128-cast.ll | 6 +- .../test/CodeGen/X86/fp128-libcalls-strict.ll | 356 +- llvm/test/CodeGen/X86/fp128-libcalls.ll | 315 +- llvm/test/CodeGen/X86/fpclamptosat.ll | 6 +- llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 154 +- llvm/test/CodeGen/X86/fpenv.ll | 12 +- llvm/test/CodeGen/X86/fptosi-sat-scalar.ll | 6 +- .../test/CodeGen/X86/fptosi-sat-vector-128.ll | 192 +- llvm/test/CodeGen/X86/fptoui-sat-scalar.ll | 2 +- .../test/CodeGen/X86/fptoui-sat-vector-128.ll | 285 +- llvm/test/CodeGen/X86/frame-base.ll | 12 +- llvm/test/CodeGen/X86/freeze-binary.ll | 42 +- llvm/test/CodeGen/X86/freeze-vector.ll | 50 +- llvm/test/CodeGen/X86/fshl.ll | 122 +- llvm/test/CodeGen/X86/fshr.ll | 146 +- llvm/test/CodeGen/X86/ftrunc.ll | 81 +- llvm/test/CodeGen/X86/funnel-shift.ll | 53 +- llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 627 +- llvm/test/CodeGen/X86/gfni-lzcnt.ll | 30 +- llvm/test/CodeGen/X86/gfni-rotates.ll | 564 +- llvm/test/CodeGen/X86/gfni-shifts.ll | 232 +- llvm/test/CodeGen/X86/gfni-tzcnt.ll | 58 
+- llvm/test/CodeGen/X86/ghc-cc64.ll | 72 +- llvm/test/CodeGen/X86/haddsub-2.ll | 232 +- llvm/test/CodeGen/X86/haddsub-4.ll | 70 +- llvm/test/CodeGen/X86/haddsub-shuf.ll | 14 +- llvm/test/CodeGen/X86/haddsub-undef.ll | 14 +- llvm/test/CodeGen/X86/half.ll | 48 +- ...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 40 +- ...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 4 +- .../CodeGen/X86/horizontal-reduce-umax.ll | 22 +- .../CodeGen/X86/horizontal-reduce-umin.ll | 2 +- .../X86/horizontal-shuffle-demanded.ll | 16 +- llvm/test/CodeGen/X86/horizontal-sum.ll | 207 +- llvm/test/CodeGen/X86/i128-add.ll | 36 +- llvm/test/CodeGen/X86/i128-mul.ll | 140 +- llvm/test/CodeGen/X86/i128-sdiv.ll | 378 +- llvm/test/CodeGen/X86/i64-to-float.ll | 46 +- .../CodeGen/X86/i686-win-shrink-wrapping.ll | 4 +- llvm/test/CodeGen/X86/iabs.ll | 2 +- llvm/test/CodeGen/X86/icmp-abs-C-vec.ll | 44 +- llvm/test/CodeGen/X86/icmp-pow2-diff.ll | 68 +- llvm/test/CodeGen/X86/icmp-shift-opt.ll | 34 +- llvm/test/CodeGen/X86/immediate_merging.ll | 8 +- llvm/test/CodeGen/X86/implicit-null-check.ll | 7 +- llvm/test/CodeGen/X86/imul.ll | 70 +- .../X86/insert-into-constant-vector.ll | 83 +- .../CodeGen/X86/insertelement-duplicates.ll | 16 +- .../CodeGen/X86/insertelement-legalize.ll | 12 +- .../test/CodeGen/X86/insertelement-shuffle.ll | 8 +- .../CodeGen/X86/insertelement-var-index.ll | 87 +- llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll | 40 +- llvm/test/CodeGen/X86/is_fpclass-fp80.ll | 54 +- llvm/test/CodeGen/X86/is_fpclass.ll | 25 +- llvm/test/CodeGen/X86/isel-and.ll | 2 +- llvm/test/CodeGen/X86/isel-buildvector-avx.ll | 13 +- llvm/test/CodeGen/X86/isel-fp-to-int.ll | 4 +- llvm/test/CodeGen/X86/isel-icmp.ll | 4 +- llvm/test/CodeGen/X86/isel-or.ll | 2 +- llvm/test/CodeGen/X86/isel-phi.ll | 12 +- llvm/test/CodeGen/X86/isel-sdiv.ll | 9 +- llvm/test/CodeGen/X86/isel-select-cmov.ll | 8 +- llvm/test/CodeGen/X86/isel-srem.ll | 70 +- llvm/test/CodeGen/X86/isel-udiv.ll | 7 - llvm/test/CodeGen/X86/isel-urem.ll | 68 +- llvm/test/CodeGen/X86/isel-xor.ll | 6 +- llvm/test/CodeGen/X86/ispow2.ll | 6 +- llvm/test/CodeGen/X86/known-bits-vector.ll | 4 +- llvm/test/CodeGen/X86/known-bits.ll | 10 +- llvm/test/CodeGen/X86/known-never-zero.ll | 42 +- llvm/test/CodeGen/X86/known-pow2.ll | 66 +- llvm/test/CodeGen/X86/known-signbits-shl.ll | 4 +- .../test/CodeGen/X86/known-signbits-vector.ll | 10 +- llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll | 4 +- .../X86/lack-of-signed-truncation-check.ll | 4 +- llvm/test/CodeGen/X86/lea-16bit.ll | 2 +- llvm/test/CodeGen/X86/lea-2.ll | 11 +- llvm/test/CodeGen/X86/lea-4.ll | 4 +- llvm/test/CodeGen/X86/lea-5.ll | 59 +- llvm/test/CodeGen/X86/lea-opt-cse1.ll | 12 +- llvm/test/CodeGen/X86/lea-opt-cse2.ll | 12 +- llvm/test/CodeGen/X86/lea-opt-cse3.ll | 48 +- llvm/test/CodeGen/X86/lea-opt-cse4.ll | 24 +- llvm/test/CodeGen/X86/lea-opt.ll | 30 +- llvm/test/CodeGen/X86/lea-recursion.ll | 6 +- llvm/test/CodeGen/X86/lea.ll | 6 +- llvm/test/CodeGen/X86/legalize-shift-64.ll | 26 +- llvm/test/CodeGen/X86/legalize-shl-vec.ll | 148 +- llvm/test/CodeGen/X86/llvm.frexp.ll | 2 +- llvm/test/CodeGen/X86/load-local-v3i1.ll | 2 +- .../test/CodeGen/X86/load-scalar-as-vector.ll | 22 +- .../CodeGen/X86/loop-strength-reduce-2.ll | 62 +- .../CodeGen/X86/loop-strength-reduce-3.ll | 27 +- llvm/test/CodeGen/X86/loop-strength-reduce.ll | 27 +- .../test/CodeGen/X86/loop-strength-reduce4.ll | 74 +- .../test/CodeGen/X86/loop-strength-reduce8.ll | 43 +- llvm/test/CodeGen/X86/lrshrink-debug.ll | 2 +- llvm/test/CodeGen/X86/lsr-i386.ll | 37 +- 
llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll | 55 +- llvm/test/CodeGen/X86/lsr-negative-stride.ll | 4 +- llvm/test/CodeGen/X86/machine-cp.ll | 31 +- llvm/test/CodeGen/X86/madd.ll | 91 +- llvm/test/CodeGen/X86/masked-iv-safe.ll | 29 +- llvm/test/CodeGen/X86/masked-iv-unsafe.ll | 70 +- llvm/test/CodeGen/X86/masked_compressstore.ll | 95 +- llvm/test/CodeGen/X86/masked_expandload.ll | 20 +- llvm/test/CodeGen/X86/masked_gather.ll | 54 +- .../test/CodeGen/X86/masked_gather_scatter.ll | 188 +- .../X86/masked_gather_scatter_widen.ll | 128 +- llvm/test/CodeGen/X86/masked_load.ll | 134 +- llvm/test/CodeGen/X86/masked_store.ll | 115 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 128 +- .../CodeGen/X86/masked_store_trunc_ssat.ll | 651 +- .../CodeGen/X86/masked_store_trunc_usat.ll | 456 +- llvm/test/CodeGen/X86/matrix-multiply.ll | 597 +- llvm/test/CodeGen/X86/mem-intrin-base-reg.ll | 177 +- .../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 200 +- .../CodeGen/X86/memcmp-more-load-pairs.ll | 530 +- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 28 +- llvm/test/CodeGen/X86/memcmp-optsize.ll | 78 +- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 28 +- llvm/test/CodeGen/X86/memcmp-pgso.ll | 78 +- llvm/test/CodeGen/X86/memcmp-x32.ll | 76 +- llvm/test/CodeGen/X86/memcmp.ll | 244 +- llvm/test/CodeGen/X86/memcpy-scoped-aa.ll | 29 +- llvm/test/CodeGen/X86/memset-inline.ll | 66 +- llvm/test/CodeGen/X86/memset-minsize.ll | 26 +- llvm/test/CodeGen/X86/memset-nonzero.ll | 190 +- llvm/test/CodeGen/X86/memset64-on-x86-32.ll | 31 +- .../X86/merge-consecutive-stores-nt.ll | 16 +- .../test/CodeGen/X86/merge-store-constants.ll | 4 +- llvm/test/CodeGen/X86/merge_store.ll | 5 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 586 +- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 314 +- llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 16 +- llvm/test/CodeGen/X86/midpoint-int.ll | 319 +- .../test/CodeGen/X86/misched-critical-path.ll | 4 +- llvm/test/CodeGen/X86/mmx-arith.ll | 2 +- llvm/test/CodeGen/X86/mmx-build-vector.ll | 6 +- llvm/test/CodeGen/X86/mmx-fold-load.ll | 36 +- llvm/test/CodeGen/X86/movmsk-cmp.ll | 14 +- llvm/test/CodeGen/X86/movtopush.ll | 1762 +- llvm/test/CodeGen/X86/mul-constant-i16.ll | 12 +- llvm/test/CodeGen/X86/mul-constant-i32.ll | 12 +- llvm/test/CodeGen/X86/mul-constant-i64.ll | 274 +- llvm/test/CodeGen/X86/mul-constant-result.ll | 2 +- llvm/test/CodeGen/X86/mul-i1024.ll | 884 +- llvm/test/CodeGen/X86/mul-i256.ll | 208 +- llvm/test/CodeGen/X86/mul-i512.ll | 1133 +- llvm/test/CodeGen/X86/mul128.ll | 40 +- llvm/test/CodeGen/X86/muloti.ll | 41 +- llvm/test/CodeGen/X86/neg-abs.ll | 20 +- llvm/test/CodeGen/X86/neg-shl-add.ll | 4 +- llvm/test/CodeGen/X86/no-split-size.ll | 2 +- llvm/test/CodeGen/X86/nontemporal-4.ll | 32 +- llvm/test/CodeGen/X86/nontemporal-loads.ll | 32 +- llvm/test/CodeGen/X86/nontemporal.ll | 122 +- llvm/test/CodeGen/X86/nosse-vector.ll | 26 +- llvm/test/CodeGen/X86/oddshuffles.ll | 248 +- llvm/test/CodeGen/X86/oddsubvector.ll | 26 +- ...of-two-or-zero-when-comparing-with-zero.ll | 14 +- llvm/test/CodeGen/X86/optimize-max-2.ll | 7 +- llvm/test/CodeGen/X86/or-lea.ll | 65 +- llvm/test/CodeGen/X86/overflow.ll | 5 +- .../CodeGen/X86/overflowing-iv-codegen.ll | 50 +- llvm/test/CodeGen/X86/packus.ll | 2 +- llvm/test/CodeGen/X86/paddus.ll | 124 +- llvm/test/CodeGen/X86/peep-test-0.ll | 12 +- .../X86/peephole-na-phys-copy-folding.ll | 2 +- llvm/test/CodeGen/X86/phaddsub-extract.ll | 10 +- llvm/test/CodeGen/X86/phaddsub-undef.ll | 14 +- llvm/test/CodeGen/X86/pmaddubsw.ll | 40 +- 
llvm/test/CodeGen/X86/pmovsx-inreg.ll | 48 +- llvm/test/CodeGen/X86/pmul.ll | 121 +- llvm/test/CodeGen/X86/pmulh.ll | 30 +- llvm/test/CodeGen/X86/popcnt.ll | 12 +- llvm/test/CodeGen/X86/pr120093.ll | 6 +- llvm/test/CodeGen/X86/pr18344.ll | 2 +- llvm/test/CodeGen/X86/pr2656.ll | 2 +- llvm/test/CodeGen/X86/pr29112.ll | 20 +- llvm/test/CodeGen/X86/pr30562.ll | 2 +- llvm/test/CodeGen/X86/pr31271.ll | 2 +- llvm/test/CodeGen/X86/pr32329.ll | 20 +- llvm/test/CodeGen/X86/pr32345.ll | 2 +- llvm/test/CodeGen/X86/pr32368.ll | 4 +- llvm/test/CodeGen/X86/pr34080-2.ll | 45 +- llvm/test/CodeGen/X86/pr34177.ll | 20 +- llvm/test/CodeGen/X86/pr34605.ll | 2 +- llvm/test/CodeGen/X86/pr35972.ll | 10 +- llvm/test/CodeGen/X86/pr35982.ll | 6 +- llvm/test/CodeGen/X86/pr37499.ll | 6 +- llvm/test/CodeGen/X86/pr38539.ll | 144 +- llvm/test/CodeGen/X86/pr38738.ll | 74 +- llvm/test/CodeGen/X86/pr38795.ll | 4 +- llvm/test/CodeGen/X86/pr38865-3.ll | 16 +- llvm/test/CodeGen/X86/pr40891.ll | 2 +- llvm/test/CodeGen/X86/pr43820.ll | 184 +- llvm/test/CodeGen/X86/pr44812.ll | 2 +- llvm/test/CodeGen/X86/pr44976.ll | 2 +- llvm/test/CodeGen/X86/pr45563-2.ll | 288 +- llvm/test/CodeGen/X86/pr45563.ll | 52 +- llvm/test/CodeGen/X86/pr45833.ll | 272 +- llvm/test/CodeGen/X86/pr47857.ll | 40 +- llvm/test/CodeGen/X86/pr47874.ll | 50 +- llvm/test/CodeGen/X86/pr48215.ll | 8 +- llvm/test/CodeGen/X86/pr49393.ll | 22 +- llvm/test/CodeGen/X86/pr50782.ll | 2 +- llvm/test/CodeGen/X86/pr57402.ll | 3 +- llvm/test/CodeGen/X86/pr61964.ll | 8 +- llvm/test/CodeGen/X86/pr62014.ll | 30 +- llvm/test/CodeGen/X86/pr63108.ll | 4 +- llvm/test/CodeGen/X86/pr63507.ll | 2 +- llvm/test/CodeGen/X86/pr65895.ll | 6 +- llvm/test/CodeGen/X86/pr74736.ll | 2 +- llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll | 3 +- .../CodeGen/X86/prefer-avx256-mask-shuffle.ll | 12 +- llvm/test/CodeGen/X86/prefer-avx256-mulo.ll | 6 +- llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll | 20 +- llvm/test/CodeGen/X86/prefer-avx256-shift.ll | 12 +- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 4 +- llvm/test/CodeGen/X86/promote-vec3.ll | 8 +- llvm/test/CodeGen/X86/psubus.ll | 62 +- llvm/test/CodeGen/X86/ptest.ll | 12 +- .../CodeGen/X86/pull-binop-through-shift.ll | 32 +- .../pull-conditional-binop-through-shift.ll | 50 +- llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll | 12 +- llvm/test/CodeGen/X86/recip-fastmath.ll | 42 +- llvm/test/CodeGen/X86/recip-fastmath2.ll | 30 +- llvm/test/CodeGen/X86/reverse_branches.ll | 18 +- llvm/test/CodeGen/X86/rotate-add.ll | 4 +- .../test/CodeGen/X86/rotate-extract-vector.ll | 41 +- llvm/test/CodeGen/X86/rotate-extract.ll | 4 +- llvm/test/CodeGen/X86/rotate.ll | 8 +- llvm/test/CodeGen/X86/rotate4.ll | 8 +- llvm/test/CodeGen/X86/sad.ll | 20 +- llvm/test/CodeGen/X86/sadd_sat.ll | 2 +- llvm/test/CodeGen/X86/sadd_sat_plus.ll | 2 +- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 98 +- llvm/test/CodeGen/X86/sat-add.ll | 22 +- llvm/test/CodeGen/X86/scalar-fp-to-i32.ll | 160 +- llvm/test/CodeGen/X86/scalar-fp-to-i64.ll | 46 +- llvm/test/CodeGen/X86/scalar_widen_div.ll | 48 +- .../CodeGen/X86/scheduler-backtracking.ll | 704 +- llvm/test/CodeGen/X86/scmp.ll | 424 +- llvm/test/CodeGen/X86/sdiv-exact.ll | 8 +- llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 193 +- llvm/test/CodeGen/X86/select-1-or-neg1.ll | 2 +- llvm/test/CodeGen/X86/select-constant-lea.ll | 3 +- .../CodeGen/X86/select-of-half-constants.ll | 4 +- llvm/test/CodeGen/X86/select.ll | 56 +- llvm/test/CodeGen/X86/select_const.ll | 47 +- llvm/test/CodeGen/X86/setcc-logic.ll | 2 +- .../test/CodeGen/X86/setcc-non-simple-type.ll | 38 +- 
llvm/test/CodeGen/X86/setcc-wide-types.ll | 466 +- llvm/test/CodeGen/X86/setcc.ll | 2 +- llvm/test/CodeGen/X86/sext-vsetcc.ll | 4 +- llvm/test/CodeGen/X86/shift-amount-mod.ll | 74 +- llvm/test/CodeGen/X86/shift-and.ll | 2 +- llvm/test/CodeGen/X86/shift-combine.ll | 45 +- llvm/test/CodeGen/X86/shift-i128.ll | 314 +- llvm/test/CodeGen/X86/shift-i256.ll | 122 +- llvm/test/CodeGen/X86/shift-parts.ll | 15 +- llvm/test/CodeGen/X86/shrink_vmul.ll | 306 +- llvm/test/CodeGen/X86/shuffle-half.ll | 2 +- .../X86/shuffle-strided-with-offset-256.ll | 231 +- .../X86/shuffle-strided-with-offset-512.ll | 2 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 146 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 88 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 32 +- .../CodeGen/X86/signed-truncation-check.ll | 4 +- llvm/test/CodeGen/X86/slow-pmulld.ll | 16 +- llvm/test/CodeGen/X86/smax.ll | 36 +- llvm/test/CodeGen/X86/smin.ll | 39 +- llvm/test/CodeGen/X86/smul-with-overflow.ll | 265 +- llvm/test/CodeGen/X86/smul_fix.ll | 106 +- llvm/test/CodeGen/X86/smul_fix_sat.ll | 284 +- .../X86/smulo-128-legalisation-lowering.ll | 765 +- .../CodeGen/X86/speculative-load-hardening.ll | 6 +- llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 43 +- .../CodeGen/X86/srem-seteq-illegal-types.ll | 26 +- llvm/test/CodeGen/X86/srem-seteq-optsize.ll | 6 +- .../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 302 +- llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 32 +- llvm/test/CodeGen/X86/srem-vector-lkk.ll | 220 +- llvm/test/CodeGen/X86/sse-fcopysign.ll | 4 +- llvm/test/CodeGen/X86/sse-regcall.ll | 2 +- llvm/test/CodeGen/X86/sse-regcall4.ll | 2 +- llvm/test/CodeGen/X86/sse1.ll | 14 +- .../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 56 +- llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll | 12 +- llvm/test/CodeGen/X86/sshl_sat.ll | 4 +- llvm/test/CodeGen/X86/sshl_sat_vec.ll | 318 +- llvm/test/CodeGen/X86/ssub_sat.ll | 4 +- llvm/test/CodeGen/X86/ssub_sat_plus.ll | 4 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 164 +- llvm/test/CodeGen/X86/stack-clash-large.ll | 16 +- .../CodeGen/X86/stack-folding-adx-x86_64.ll | 54 +- .../test/CodeGen/X86/stack-folding-fp-avx1.ll | 16 +- .../CodeGen/X86/stack-folding-fp-sse42.ll | 7 +- .../CodeGen/X86/stack-folding-int-avx1.ll | 8 +- .../CodeGen/X86/stack-folding-int-sse42.ll | 8 +- .../CodeGen/X86/statepoint-call-lowering.ll | 21 +- .../CodeGen/X86/statepoint-deopt-lowering.ll | 22 +- .../CodeGen/X86/statepoint-stackmap-format.ll | 12 +- llvm/test/CodeGen/X86/strict-fsub-combines.ll | 12 +- llvm/test/CodeGen/X86/sttni.ll | 38 +- llvm/test/CodeGen/X86/subcarry.ll | 54 +- llvm/test/CodeGen/X86/subvector-broadcast.ll | 26 +- .../subvectorwise-store-of-vector-splat.ll | 4 +- .../X86/tail-dup-merge-loop-headers.ll | 2 +- llvm/test/CodeGen/X86/tbm_patterns.ll | 4 +- llvm/test/CodeGen/X86/trunc-nsw-nuw.ll | 3 +- .../CodeGen/X86/tuning-shuffle-permilps.ll | 4 +- llvm/test/CodeGen/X86/uadd_sat.ll | 4 +- llvm/test/CodeGen/X86/uadd_sat_plus.ll | 2 +- llvm/test/CodeGen/X86/uadd_sat_vec.ll | 46 +- llvm/test/CodeGen/X86/ucmp.ll | 118 +- llvm/test/CodeGen/X86/udiv-exact.ll | 8 +- llvm/test/CodeGen/X86/uint64-to-float.ll | 2 +- llvm/test/CodeGen/X86/uint_to_half.ll | 8 +- llvm/test/CodeGen/X86/umax.ll | 304 +- llvm/test/CodeGen/X86/umin.ll | 39 +- llvm/test/CodeGen/X86/umul-with-overflow.ll | 134 +- llvm/test/CodeGen/X86/umul_fix.ll | 149 +- llvm/test/CodeGen/X86/umul_fix_sat.ll | 141 +- .../X86/umulo-128-legalisation-lowering.ll | 47 +- .../X86/umulo-64-legalisation-lowering.ll | 30 +- 
...-masked-merge-vector-variablemask-const.ll | 20 +- ...unfold-masked-merge-vector-variablemask.ll | 2158 +- .../CodeGen/X86/urem-seteq-illegal-types.ll | 8 +- llvm/test/CodeGen/X86/urem-seteq-optsize.ll | 4 +- .../CodeGen/X86/urem-seteq-vec-nonsplat.ll | 34 +- .../CodeGen/X86/urem-seteq-vec-nonzero.ll | 8 +- llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 2 +- .../X86/urem-seteq-vec-tautological.ll | 12 +- llvm/test/CodeGen/X86/urem-vector-lkk.ll | 32 +- llvm/test/CodeGen/X86/use-add-flags.ll | 4 +- llvm/test/CodeGen/X86/ushl_sat.ll | 20 +- llvm/test/CodeGen/X86/ushl_sat_vec.ll | 213 +- llvm/test/CodeGen/X86/usub_sat.ll | 4 +- llvm/test/CodeGen/X86/usub_sat_plus.ll | 4 +- llvm/test/CodeGen/X86/usub_sat_vec.ll | 112 +- llvm/test/CodeGen/X86/v8i1-masks.ll | 252 +- llvm/test/CodeGen/X86/var-permute-128.ll | 42 +- llvm/test/CodeGen/X86/var-permute-256.ll | 148 +- llvm/test/CodeGen/X86/var-permute-512.ll | 1340 +- llvm/test/CodeGen/X86/vec-strict-128.ll | 11 +- .../CodeGen/X86/vec-strict-cmp-128-fp16.ll | 36 +- llvm/test/CodeGen/X86/vec-strict-cmp-128.ll | 608 +- .../test/CodeGen/X86/vec-strict-cmp-sub128.ll | 92 +- .../CodeGen/X86/vec-strict-fptoint-128.ll | 144 +- .../CodeGen/X86/vec-strict-fptoint-256.ll | 6 +- .../CodeGen/X86/vec-strict-fptoint-512.ll | 76 +- .../CodeGen/X86/vec-strict-inttofp-256.ll | 144 +- .../CodeGen/X86/vec-strict-inttofp-512.ll | 128 +- llvm/test/CodeGen/X86/vec_anyext.ll | 2 +- llvm/test/CodeGen/X86/vec_call.ll | 6 +- llvm/test/CodeGen/X86/vec_cast.ll | 10 +- llvm/test/CodeGen/X86/vec_cmp_sint-128.ll | 32 +- llvm/test/CodeGen/X86/vec_cmp_uint-128.ll | 32 +- llvm/test/CodeGen/X86/vec_compare-sse4.ll | 2 +- llvm/test/CodeGen/X86/vec_extract-mmx.ll | 24 +- llvm/test/CodeGen/X86/vec_fabs.ll | 177 +- llvm/test/CodeGen/X86/vec_fcopysign.ll | 346 +- llvm/test/CodeGen/X86/vec_floor.ll | 40 +- llvm/test/CodeGen/X86/vec_fneg.ll | 90 +- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 512 +- llvm/test/CodeGen/X86/vec_fpext.ll | 6 +- llvm/test/CodeGen/X86/vec_fptrunc.ll | 34 +- llvm/test/CodeGen/X86/vec_ins_extract-1.ll | 2 +- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 813 +- llvm/test/CodeGen/X86/vec_minmax_sint.ll | 304 +- llvm/test/CodeGen/X86/vec_minmax_uint.ll | 296 +- llvm/test/CodeGen/X86/vec_saddo.ll | 112 +- llvm/test/CodeGen/X86/vec_setcc-2.ll | 41 +- llvm/test/CodeGen/X86/vec_setcc.ll | 70 +- llvm/test/CodeGen/X86/vec_shift4.ll | 4 +- llvm/test/CodeGen/X86/vec_shift6.ll | 6 +- llvm/test/CodeGen/X86/vec_smulo.ll | 814 +- llvm/test/CodeGen/X86/vec_ssubo.ll | 128 +- llvm/test/CodeGen/X86/vec_uaddo.ll | 82 +- .../CodeGen/X86/vec_uint_to_fp-fastmath.ll | 194 +- llvm/test/CodeGen/X86/vec_uint_to_fp.ll | 280 +- llvm/test/CodeGen/X86/vec_umulo.ll | 503 +- llvm/test/CodeGen/X86/vec_usubo.ll | 82 +- llvm/test/CodeGen/X86/vector-bitreverse.ll | 266 +- llvm/test/CodeGen/X86/vector-blend.ll | 2 +- .../CodeGen/X86/vector-bo-select-avx512.ll | 10 +- llvm/test/CodeGen/X86/vector-bo-select.ll | 1887 +- .../CodeGen/X86/vector-compare-results.ll | 68 +- llvm/test/CodeGen/X86/vector-compress.ll | 1611 +- .../X86/vector-constrained-fp-intrinsics.ll | 278 +- llvm/test/CodeGen/X86/vector-ext-logic.ll | 2 +- llvm/test/CodeGen/X86/vector-extend-inreg.ll | 4 +- llvm/test/CodeGen/X86/vector-fshl-128.ll | 415 +- llvm/test/CodeGen/X86/vector-fshl-256.ll | 348 +- llvm/test/CodeGen/X86/vector-fshl-512.ll | 216 +- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 216 +- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 135 +- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 66 +- 
.../CodeGen/X86/vector-fshl-rot-sub128.ll | 10 +- llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 86 +- llvm/test/CodeGen/X86/vector-fshr-128.ll | 445 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 436 +- llvm/test/CodeGen/X86/vector-fshr-512.ll | 246 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 225 +- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 133 +- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 40 +- .../CodeGen/X86/vector-fshr-rot-sub128.ll | 10 +- llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 62 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 178 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 138 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 174 +- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 36 +- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 74 +- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 122 +- llvm/test/CodeGen/X86/vector-idiv.ll | 10 +- .../vector-interleaved-load-i16-stride-2.ll | 44 +- .../vector-interleaved-load-i16-stride-3.ll | 1666 +- .../vector-interleaved-load-i16-stride-4.ll | 1002 +- .../vector-interleaved-load-i16-stride-5.ll | 3452 ++- .../vector-interleaved-load-i16-stride-6.ll | 5117 ++-- .../vector-interleaved-load-i16-stride-7.ll | 6745 ++--- .../vector-interleaved-load-i16-stride-8.ll | 712 +- .../vector-interleaved-load-i32-stride-2.ll | 356 +- .../vector-interleaved-load-i32-stride-3.ll | 392 +- .../vector-interleaved-load-i32-stride-4.ll | 1720 +- .../vector-interleaved-load-i32-stride-5.ll | 4644 ++-- .../vector-interleaved-load-i32-stride-6.ll | 6033 ++--- .../vector-interleaved-load-i32-stride-7.ll | 11364 ++++---- .../vector-interleaved-load-i32-stride-8.ll | 9589 ++++--- .../vector-interleaved-load-i64-stride-2.ll | 478 +- .../vector-interleaved-load-i64-stride-3.ll | 3041 +-- .../vector-interleaved-load-i64-stride-4.ll | 1412 +- .../vector-interleaved-load-i64-stride-5.ll | 8351 +++--- .../vector-interleaved-load-i64-stride-6.ll | 6613 ++--- .../vector-interleaved-load-i64-stride-7.ll | 21820 ++++++++-------- .../vector-interleaved-load-i64-stride-8.ll | 14929 +++++------ .../vector-interleaved-load-i8-stride-2.ll | 210 +- .../vector-interleaved-load-i8-stride-3.ll | 220 +- .../vector-interleaved-load-i8-stride-4.ll | 230 +- .../vector-interleaved-load-i8-stride-5.ll | 4793 ++-- .../vector-interleaved-load-i8-stride-6.ll | 6843 ++--- .../vector-interleaved-load-i8-stride-7.ll | 12523 +++++---- .../vector-interleaved-load-i8-stride-8.ll | 1798 +- .../vector-interleaved-store-i16-stride-2.ll | 48 +- .../vector-interleaved-store-i16-stride-3.ll | 1174 +- .../vector-interleaved-store-i16-stride-4.ll | 224 +- .../vector-interleaved-store-i16-stride-5.ll | 4614 ++-- .../vector-interleaved-store-i16-stride-6.ll | 2213 +- .../vector-interleaved-store-i16-stride-7.ll | 7617 +++--- .../vector-interleaved-store-i16-stride-8.ll | 1164 +- .../vector-interleaved-store-i32-stride-2.ll | 128 +- .../vector-interleaved-store-i32-stride-3.ll | 714 +- .../vector-interleaved-store-i32-stride-4.ll | 440 +- .../vector-interleaved-store-i32-stride-5.ll | 3187 +-- .../vector-interleaved-store-i32-stride-6.ll | 5418 ++-- .../vector-interleaved-store-i32-stride-7.ll | 7040 ++--- .../vector-interleaved-store-i32-stride-8.ll | 2300 +- .../vector-interleaved-store-i64-stride-2.ll | 170 +- .../vector-interleaved-store-i64-stride-3.ll | 1432 +- .../vector-interleaved-store-i64-stride-4.ll | 760 +- .../vector-interleaved-store-i64-stride-5.ll | 5540 ++-- .../vector-interleaved-store-i64-stride-6.ll | 11644 +++++---- 
.../vector-interleaved-store-i64-stride-7.ll | 15757 +++++------ .../vector-interleaved-store-i64-stride-8.ll | 8449 +++--- .../vector-interleaved-store-i8-stride-2.ll | 8 +- .../vector-interleaved-store-i8-stride-3.ll | 149 +- .../vector-interleaved-store-i8-stride-5.ll | 2027 +- .../vector-interleaved-store-i8-stride-6.ll | 2006 +- .../vector-interleaved-store-i8-stride-7.ll | 6559 +++-- .../vector-interleaved-store-i8-stride-8.ll | 540 +- llvm/test/CodeGen/X86/vector-llrint-f16.ll | 422 +- llvm/test/CodeGen/X86/vector-llrint.ll | 186 +- llvm/test/CodeGen/X86/vector-lrint-f16.ll | 1227 +- llvm/test/CodeGen/X86/vector-lrint.ll | 66 +- llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 188 +- llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 320 +- llvm/test/CodeGen/X86/vector-lzcnt-512.ll | 56 +- .../X86/vector-merge-store-fp-constants.ll | 7 +- llvm/test/CodeGen/X86/vector-mul.ll | 580 +- llvm/test/CodeGen/X86/vector-pack-128.ll | 8 +- llvm/test/CodeGen/X86/vector-pack-256.ll | 4 +- llvm/test/CodeGen/X86/vector-pack-512.ll | 72 +- llvm/test/CodeGen/X86/vector-pcmp.ll | 4 +- .../CodeGen/X86/vector-popcnt-128-ult-ugt.ll | 1377 +- llvm/test/CodeGen/X86/vector-popcnt-128.ll | 60 +- .../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 1228 +- llvm/test/CodeGen/X86/vector-popcnt-256.ll | 104 +- .../CodeGen/X86/vector-popcnt-512-ult-ugt.ll | 1938 +- llvm/test/CodeGen/X86/vector-popcnt-512.ll | 45 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 22 +- .../CodeGen/X86/vector-reduce-and-bool.ll | 94 +- .../CodeGen/X86/vector-reduce-and-scalar.ll | 14 +- llvm/test/CodeGen/X86/vector-reduce-and.ll | 10 +- llvm/test/CodeGen/X86/vector-reduce-ctpop.ll | 80 +- llvm/test/CodeGen/X86/vector-reduce-fmax.ll | 2 +- .../CodeGen/X86/vector-reduce-fmaximum.ll | 110 +- llvm/test/CodeGen/X86/vector-reduce-fmin.ll | 6 +- llvm/test/CodeGen/X86/vector-reduce-mul.ll | 2 +- .../test/CodeGen/X86/vector-reduce-or-bool.ll | 4 +- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 6 +- llvm/test/CodeGen/X86/vector-reduce-or.ll | 10 +- llvm/test/CodeGen/X86/vector-reduce-smax.ll | 8 +- llvm/test/CodeGen/X86/vector-reduce-smin.ll | 8 +- llvm/test/CodeGen/X86/vector-reduce-umax.ll | 2 +- llvm/test/CodeGen/X86/vector-reduce-umin.ll | 2 +- .../CodeGen/X86/vector-reduce-xor-bool.ll | 92 +- llvm/test/CodeGen/X86/vector-reduce-xor.ll | 10 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 634 +- llvm/test/CodeGen/X86/vector-rotate-128.ll | 204 +- llvm/test/CodeGen/X86/vector-rotate-256.ll | 120 +- llvm/test/CodeGen/X86/vector-rotate-512.ll | 66 +- llvm/test/CodeGen/X86/vector-sext.ll | 287 +- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 188 +- .../test/CodeGen/X86/vector-shift-ashr-256.ll | 222 +- .../test/CodeGen/X86/vector-shift-ashr-512.ll | 10 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 326 +- .../X86/vector-shift-by-select-loop.ll | 226 +- .../test/CodeGen/X86/vector-shift-lshr-128.ll | 108 +- .../test/CodeGen/X86/vector-shift-lshr-256.ll | 171 +- .../test/CodeGen/X86/vector-shift-lshr-512.ll | 16 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 242 +- llvm/test/CodeGen/X86/vector-shift-lut.ll | 104 +- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 42 +- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 34 +- .../CodeGen/X86/vector-shift-shl-sub128.ll | 84 +- .../CodeGen/X86/vector-shuffle-128-v16.ll | 84 +- .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 18 +- .../test/CodeGen/X86/vector-shuffle-128-v8.ll | 10 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 312 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 102 +- 
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 68 +- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 565 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 314 +- .../test/CodeGen/X86/vector-shuffle-512-v8.ll | 1230 +- .../test/CodeGen/X86/vector-shuffle-avx512.ll | 30 +- .../X86/vector-shuffle-combining-avx.ll | 72 +- .../X86/vector-shuffle-combining-avx2.ll | 85 +- .../X86/vector-shuffle-combining-avx512bw.ll | 20 +- .../vector-shuffle-combining-avx512bwvl.ll | 20 +- .../X86/vector-shuffle-combining-avx512f.ll | 138 +- .../vector-shuffle-combining-avx512vbmi.ll | 2 +- .../X86/vector-shuffle-combining-sse41.ll | 4 +- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 60 +- llvm/test/CodeGen/X86/vector-shuffle-v192.ll | 115 +- llvm/test/CodeGen/X86/vector-shuffle-v48.ll | 29 +- .../X86/vector-shuffle-variable-128.ll | 32 +- .../X86/vector-shuffle-variable-256.ll | 20 +- llvm/test/CodeGen/X86/vector-trunc-math.ll | 118 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 1296 +- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 1156 +- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 483 +- llvm/test/CodeGen/X86/vector-trunc.ll | 152 +- llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 160 +- llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 268 +- llvm/test/CodeGen/X86/vector-tzcnt-512.ll | 78 +- llvm/test/CodeGen/X86/vector-unsigned-cmp.ll | 218 +- llvm/test/CodeGen/X86/vector-zext.ll | 2 +- ...vector_splat-const-shift-of-constmasked.ll | 16 +- llvm/test/CodeGen/X86/vselect-avx.ll | 26 +- llvm/test/CodeGen/X86/vselect-constants.ll | 4 +- llvm/test/CodeGen/X86/vselect-minmax.ll | 16 +- llvm/test/CodeGen/X86/vselect-packss.ll | 10 +- llvm/test/CodeGen/X86/vselect-pcmp.ll | 105 +- llvm/test/CodeGen/X86/vselect-post-combine.ll | 4 +- llvm/test/CodeGen/X86/vselect.ll | 40 +- llvm/test/CodeGen/X86/vshift-6.ll | 4 +- llvm/test/CodeGen/X86/wide-integer-cmp.ll | 36 +- ...lar-shift-by-byte-multiple-legalization.ll | 16498 ++++++------ .../X86/wide-scalar-shift-legalization.ll | 6681 +++-- ...ad-of-small-alloca-with-zero-upper-half.ll | 1121 +- .../CodeGen/X86/widen-load-of-small-alloca.ll | 688 +- llvm/test/CodeGen/X86/widen_arith-5.ll | 2 +- llvm/test/CodeGen/X86/widen_bitcnt.ll | 52 +- llvm/test/CodeGen/X86/widen_bitops-0.ll | 12 +- llvm/test/CodeGen/X86/widen_cast-2.ll | 2 +- llvm/test/CodeGen/X86/widen_cast-4.ll | 2 +- llvm/test/CodeGen/X86/widen_conv-4.ll | 4 +- llvm/test/CodeGen/X86/widen_fadd.ll | 103 +- llvm/test/CodeGen/X86/widen_fdiv.ll | 72 +- llvm/test/CodeGen/X86/widen_fmul.ll | 103 +- llvm/test/CodeGen/X86/widen_fsub.ll | 103 +- llvm/test/CodeGen/X86/widen_load-2.ll | 38 +- llvm/test/CodeGen/X86/win64-byval.ll | 30 +- llvm/test/CodeGen/X86/x86-64-varargs.ll | 188 +- .../CodeGen/X86/x86-interleaved-access.ll | 139 +- .../X86/x86-no_caller_saved_registers.ll | 399 +- llvm/test/CodeGen/X86/x86-shrink-wrapping.ll | 10 +- llvm/test/CodeGen/X86/xaluo128.ll | 64 +- llvm/test/CodeGen/X86/xmulo.ll | 341 +- llvm/test/CodeGen/X86/xor-lea.ll | 4 +- llvm/test/CodeGen/X86/xor.ll | 6 +- .../CodeGen/X86/zero-call-used-regs-i386.ll | 4 +- .../CodeGen/X86/zero_extend_vector_inreg.ll | 507 +- .../zero_extend_vector_inreg_of_broadcast.ll | 1260 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 447 +- 793 files changed, 169526 insertions(+), 170938 deletions(-) diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index a8ee9f55611b6..95cef72057552 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -253,7 +253,7 @@ void 
X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, CPU = "generic"; if (TuneCPU.empty()) - TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect. + TuneCPU = "generic"; std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); assert(!FullFS.empty() && "Failed to parse X86 triple"); diff --git a/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll index 49e2bf207e52a..dc7385658fce7 100644 --- a/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll +++ b/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll @@ -4,47 +4,50 @@ define void @foo(ptr %buf, i32 %size, i32 %col, ptr %p) nounwind { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: jle LBB0_3 ; CHECK-NEXT: ## %bb.1: ## %bb.preheader ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: addl $8, %ecx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_2: ## %bb ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%esi), %edi -; CHECK-NEXT: movzbl -8(%ecx), %ebx -; CHECK-NEXT: movb %bl, (%edi,%edx) -; CHECK-NEXT: movzbl -7(%ecx), %ebx -; CHECK-NEXT: movb %bl, 7(%edi,%edx) -; CHECK-NEXT: movzbl -6(%ecx), %ebx -; CHECK-NEXT: movb %bl, 5(%edi,%edx) -; CHECK-NEXT: movzbl -5(%ecx), %ebx -; CHECK-NEXT: movb %bl, 3(%edi,%edx) -; CHECK-NEXT: movzbl -4(%ecx), %ebx -; CHECK-NEXT: movb %bl, 2(%edi,%edx) -; CHECK-NEXT: movzbl -3(%ecx), %ebx -; CHECK-NEXT: movb %bl, 1(%edi,%edx) -; CHECK-NEXT: movzbl -2(%ecx), %ebx -; CHECK-NEXT: movb %bl, 2(%edi,%edx) -; CHECK-NEXT: movzbl -1(%ecx), %ebx -; CHECK-NEXT: movb %bl, 4(%edi,%edx) -; CHECK-NEXT: movzbl (%ecx), %ebx -; CHECK-NEXT: movb %bl, 6(%edi,%edx) -; CHECK-NEXT: addl $4, %esi +; CHECK-NEXT: movl (%esi,%edi,4), %ebx +; CHECK-NEXT: movzbl -8(%ecx), %eax +; CHECK-NEXT: movb %al, (%ebx,%edx) +; CHECK-NEXT: movzbl -7(%ecx), %eax +; CHECK-NEXT: movb %al, 7(%ebx,%edx) +; CHECK-NEXT: movzbl -6(%ecx), %eax +; CHECK-NEXT: movb %al, 5(%ebx,%edx) +; CHECK-NEXT: movzbl -5(%ecx), %eax +; CHECK-NEXT: movb %al, 3(%ebx,%edx) +; CHECK-NEXT: movzbl -4(%ecx), %eax +; CHECK-NEXT: movb %al, 2(%ebx,%edx) +; CHECK-NEXT: movzbl -3(%ecx), %eax +; CHECK-NEXT: movb %al, 1(%ebx,%edx) +; CHECK-NEXT: movzbl -2(%ecx), %eax +; CHECK-NEXT: movb %al, 2(%ebx,%edx) +; CHECK-NEXT: movzbl -1(%ecx), %eax +; CHECK-NEXT: movb %al, 4(%ebx,%edx) +; CHECK-NEXT: movzbl (%ecx), %eax +; CHECK-NEXT: movb %al, 6(%ebx,%edx) +; CHECK-NEXT: incl %edi ; CHECK-NEXT: addl $9, %ecx -; CHECK-NEXT: decl %eax +; CHECK-NEXT: cmpl %edi, %ebp ; CHECK-NEXT: jne LBB0_2 ; CHECK-NEXT: LBB0_3: ## %return ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: icmp sgt i32 %size, 0 ; :0 [#uses=1] diff --git a/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll b/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll index 54f85b8f73817..4f283c7a27b3c 100644 --- a/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll +++ b/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll @@ -36,7 +36,8 @@ define void @bar(i32 %i) nounwind { ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: calll frob@PLT ; CHECK-NEXT: addl $4, %esp -; CHECK-NEXT: leal X(%esp,%esi,4), %eax +; 
CHECK-NEXT: leal (%esp,%esi,4), %eax +; CHECK-NEXT: addl $X, %eax ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: calll borf@PLT ; CHECK-NEXT: addl $44, %esp diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962ddebc2115..1a8fdc5b4bc8f 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -29,32 +29,29 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $28, %esp +; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: .cfi_offset %esi, -20 ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: Ltmp0: -; CHECK-NEXT: ## implicit-def: $ebx +; CHECK-NEXT: ## implicit-def: $edi ; CHECK-NEXT: calll __Znam ; CHECK-NEXT: Ltmp1: ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: movb $1, %bl +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_2 ; CHECK-NEXT: ## %bb.7: ## %bb31 -; CHECK-NEXT: ## implicit-def: $eax -; CHECK-NEXT: ## kill: killed $eax +; CHECK-NEXT: ## implicit-def: $edi ; CHECK-NEXT: LBB0_8: ## %bb38 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_13 Depth 2 ; CHECK-NEXT: ## Child Loop BB0_16 Depth 3 ; CHECK-NEXT: ## Child Loop BB0_21 Depth 2 -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 @@ -78,8 +75,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## => This Loop Header: Depth=2 ; CHECK-NEXT: ## Child Loop BB0_16 Depth 3 -; CHECK-NEXT: movb $1, %cl -; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb48 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=2 @@ -87,15 +83,11 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: ## %bb.15: ## %bb49.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=2 ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: LBB0_16: ## %bb49 ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=2 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=3 ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: addl $4, %edx -; CHECK-NEXT: decl %ebx ; CHECK-NEXT: jne LBB0_16 ; CHECK-NEXT: LBB0_17: ## %bb57 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=2 @@ -107,33 +99,30 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: calll ___bzero -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb61.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: LBB0_21: ## %bb61 ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movl $0, (%eax) -; CHECK-NEXT: addl $4, %eax -; CHECK-NEXT: decl %ecx +; 
CHECK-NEXT: movl $0, (%esi,%eax,4) +; CHECK-NEXT: incl %eax ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb67 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: decl %edi ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 ; CHECK-NEXT: Ltmp5: -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: calll _OnOverFlow ; CHECK-NEXT: Ltmp6: ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 ; CHECK-NEXT: Ltmp7: -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: calll _OnOverFlow ; CHECK-NEXT: Ltmp8: ; CHECK-NEXT: LBB0_3: ## %bb30 @@ -141,10 +130,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit ; CHECK-NEXT: Ltmp4: ; CHECK-NEXT: LBB0_9: -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: LBB0_6: ## %bb23 -; CHECK-NEXT: testl %ebx, %ebx -; CHECK-NEXT: addl $28, %esp +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: addl $12, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/2012-12-1-merge-multiple.ll b/llvm/test/CodeGen/X86/2012-12-1-merge-multiple.ll index 86af5fc58c977..5a3ebeceb73c7 100644 --- a/llvm/test/CodeGen/X86/2012-12-1-merge-multiple.ll +++ b/llvm/test/CodeGen/X86/2012-12-1-merge-multiple.ll @@ -4,10 +4,8 @@ define void @multiple_stores_on_chain(ptr %A) { ; CHECK-LABEL: multiple_stores_on_chain: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movabsq $844433520132096, %rax # imm = 0x3000200010000 -; CHECK-NEXT: movq %rax, (%rdi) -; CHECK-NEXT: movabsq $1970350607106052, %rax # imm = 0x7000600050004 -; CHECK-NEXT: movq %rax, 8(%rdi) +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: retq entry: %a1 = getelementptr inbounds i16, ptr %A, i64 1 diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll index 4c92adb25d0bd..d9ccdbdc9ea5f 100644 --- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll +++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll @@ -307,19 +307,19 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X86-NOBMI2-NEXT: pushl %esi ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI2-NEXT: movl %eax, %ebx ; X86-NOBMI2-NEXT: addl $32, %ebx ; X86-NOBMI2-NEXT: adcl $0, %edi -; X86-NOBMI2-NEXT: movl %ebx, (%ecx) -; X86-NOBMI2-NEXT: movl %edi, 4(%ecx) ; X86-NOBMI2-NEXT: movb $32, %cl ; X86-NOBMI2-NEXT: subb %al, %cl ; X86-NOBMI2-NEXT: movl %esi, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax ; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl %ebx, (%esi) +; X86-NOBMI2-NEXT: movl %edi, 4(%esi) ; X86-NOBMI2-NEXT: testb $32, %cl ; X86-NOBMI2-NEXT: je .LBB5_2 ; X86-NOBMI2-NEXT: # %bb.1: @@ -338,17 +338,17 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movl %ebx, %edi ; X86-BMI2-NEXT: addl $32, %edi ; X86-BMI2-NEXT: adcl $0, 
%esi -; X86-BMI2-NEXT: movl %edi, (%ecx) -; X86-BMI2-NEXT: movl %esi, 4(%ecx) ; X86-BMI2-NEXT: movb $32, %cl ; X86-BMI2-NEXT: subb %bl, %cl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: movl %edi, (%ebx) +; X86-BMI2-NEXT: movl %esi, 4(%ebx) ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB5_2 @@ -390,18 +390,18 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: pushl %edi ; X86-NOBMI2-NEXT: pushl %esi -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI2-NEXT: xorl %edi, %edi +; X86-NOBMI2-NEXT: xorl %esi, %esi ; X86-NOBMI2-NEXT: movl $32, %ecx ; X86-NOBMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NOBMI2-NEXT: movl %ecx, (%eax) -; X86-NOBMI2-NEXT: movl %edi, 4(%eax) -; X86-NOBMI2-NEXT: movl %esi, %eax +; X86-NOBMI2-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI2-NEXT: movl %edi, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax -; X86-NOBMI2-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI2-NEXT: shldl %cl, %edi, %edx +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOBMI2-NEXT: movl %ecx, (%edi) +; X86-NOBMI2-NEXT: movl %esi, 4(%edi) ; X86-NOBMI2-NEXT: testb $32, %cl ; X86-NOBMI2-NEXT: je .LBB6_2 ; X86-NOBMI2-NEXT: # %bb.1: @@ -423,9 +423,9 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind { ; X86-BMI2-NEXT: movl $32, %ecx ; X86-BMI2-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-BMI2-NEXT: shldl %cl, %eax, %edx ; X86-BMI2-NEXT: movl %ecx, (%esi) ; X86-BMI2-NEXT: movl %edi, 4(%esi) -; X86-BMI2-NEXT: shldl %cl, %eax, %edx ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB6_2 diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 0c349c3aa8ec1..7a301c06107ce 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -124,9 +124,9 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll index 7bde1b7a7a8be..ac916ffe29ed7 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -18,10 +18,10 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, (%eax) @@ -44,8 +44,8 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) { ; X86-LABEL: test_add_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl %ret = add i64 %arg1, %arg2 diff --git a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll index 7a035f5e4ad4d..d8f113753ed8f 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll @@ -18,10 +18,10 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, (%eax) @@ -45,8 +45,8 @@ define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { ; X86-LABEL: test_sub_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl %ret = sub i64 %arg1, %arg2 diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll index 0103d2bf3cc2c..be670237ad7fd 100644 --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -17,13 +17,14 @@ define void @merge_const_store(i32 %count, ptr nocapture %p) nounwind uwtable no ; X86-NEXT: jle .LBB0_3 ; X86-NEXT: # %bb.1: # %.lr.ph.preheader ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_2: # %.lr.ph ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl $67305985, (%ecx) # imm = 0x4030201 -; X86-NEXT: movl $134678021, 4(%ecx) # imm = 0x8070605 -; X86-NEXT: addl $8, %ecx -; X86-NEXT: decl %eax +; X86-NEXT: movl $67305985, (%ecx,%edx,8) # imm = 0x4030201 +; X86-NEXT: movl $134678021, 4(%ecx,%edx,8) # imm = 0x8070605 +; X86-NEXT: incl %edx +; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: jne .LBB0_2 ; X86-NEXT: .LBB0_3: # %._crit_edge ; X86-NEXT: retl @@ -33,13 +34,15 @@ define void @merge_const_store(i32 %count, ptr nocapture %p) nounwind uwtable no ; X64-NEXT: testl %edi, %edi ; X64-NEXT: jle .LBB0_3 ; X64-NEXT: # %bb.1: # %.lr.ph.preheader -; X64-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: movabsq $578437695752307201, %rdx # imm = 0x807060504030201 ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB0_2: # %.lr.ph ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movq %rax, (%rsi) -; X64-NEXT: addq $8, %rsi -; X64-NEXT: decl %edi +; X64-NEXT: movq 
%rdx, (%rsi,%rcx,8) +; X64-NEXT: incq %rcx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: jne .LBB0_2 ; X64-NEXT: .LBB0_3: # %._crit_edge ; X64-NEXT: retq @@ -213,61 +216,76 @@ define void @merge_const_store_vec(i32 %count, ptr nocapture %p) nounwind uwtabl define void @merge_nonconst_store(i32 %count, i8 %zz, ptr nocapture %p) nounwind uwtable noinline ssp { ; X86-BWON-LABEL: merge_nonconst_store: ; X86-BWON: # %bb.0: +; X86-BWON-NEXT: pushl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 8 +; X86-BWON-NEXT: .cfi_offset %esi, -8 ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWON-NEXT: testl %eax, %eax ; X86-BWON-NEXT: jle .LBB3_3 ; X86-BWON-NEXT: # %bb.1: # %.lr.ph.preheader ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWON-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-BWON-NEXT: xorl %esi, %esi ; X86-BWON-NEXT: .p2align 4 ; X86-BWON-NEXT: .LBB3_2: # %.lr.ph ; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BWON-NEXT: movl $67305985, (%ecx) # imm = 0x4030201 -; X86-BWON-NEXT: movb %dl, 4(%ecx) -; X86-BWON-NEXT: movw $1798, 5(%ecx) # imm = 0x706 -; X86-BWON-NEXT: movb $8, 7(%ecx) -; X86-BWON-NEXT: addl $8, %ecx -; X86-BWON-NEXT: decl %eax +; X86-BWON-NEXT: movl $67305985, (%ecx,%esi,8) # imm = 0x4030201 +; X86-BWON-NEXT: movb %dl, 4(%ecx,%esi,8) +; X86-BWON-NEXT: movw $1798, 5(%ecx,%esi,8) # imm = 0x706 +; X86-BWON-NEXT: movb $8, 7(%ecx,%esi,8) +; X86-BWON-NEXT: incl %esi +; X86-BWON-NEXT: cmpl %esi, %eax ; X86-BWON-NEXT: jne .LBB3_2 ; X86-BWON-NEXT: .LBB3_3: # %._crit_edge +; X86-BWON-NEXT: popl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 4 ; X86-BWON-NEXT: retl ; ; X86-BWOFF-LABEL: merge_nonconst_store: ; X86-BWOFF: # %bb.0: +; X86-BWOFF-NEXT: pushl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8 +; X86-BWOFF-NEXT: .cfi_offset %esi, -8 ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWOFF-NEXT: testl %eax, %eax ; X86-BWOFF-NEXT: jle .LBB3_3 ; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph.preheader ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWOFF-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-BWOFF-NEXT: xorl %esi, %esi ; X86-BWOFF-NEXT: .p2align 4 ; X86-BWOFF-NEXT: .LBB3_2: # %.lr.ph ; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BWOFF-NEXT: movl $67305985, (%ecx) # imm = 0x4030201 -; X86-BWOFF-NEXT: movb %dl, 4(%ecx) -; X86-BWOFF-NEXT: movw $1798, 5(%ecx) # imm = 0x706 -; X86-BWOFF-NEXT: movb $8, 7(%ecx) -; X86-BWOFF-NEXT: addl $8, %ecx -; X86-BWOFF-NEXT: decl %eax +; X86-BWOFF-NEXT: movl $67305985, (%ecx,%esi,8) # imm = 0x4030201 +; X86-BWOFF-NEXT: movb %dl, 4(%ecx,%esi,8) +; X86-BWOFF-NEXT: movw $1798, 5(%ecx,%esi,8) # imm = 0x706 +; X86-BWOFF-NEXT: movb $8, 7(%ecx,%esi,8) +; X86-BWOFF-NEXT: incl %esi +; X86-BWOFF-NEXT: cmpl %esi, %eax ; X86-BWOFF-NEXT: jne .LBB3_2 ; X86-BWOFF-NEXT: .LBB3_3: # %._crit_edge +; X86-BWOFF-NEXT: popl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4 ; X86-BWOFF-NEXT: retl ; ; X64-LABEL: merge_nonconst_store: ; X64: # %bb.0: ; X64-NEXT: testl %edi, %edi -; X64-NEXT: jle .LBB3_2 +; X64-NEXT: jle .LBB3_3 +; X64-NEXT: # %bb.1: # %.lr.ph.preheader +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: .p2align 4 -; X64-NEXT: .LBB3_1: # %.lr.ph +; X64-NEXT: .LBB3_2: # %.lr.ph ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl $67305985, (%rdx) # imm = 0x4030201 -; X64-NEXT: movb %sil, 4(%rdx) -; X64-NEXT: movw $1798, 5(%rdx) # imm = 0x706 -; X64-NEXT: movb $8, 7(%rdx) -; X64-NEXT: addq $8, %rdx -; X64-NEXT: decl %edi -; X64-NEXT: jne .LBB3_1 -; X64-NEXT: .LBB3_2: # %._crit_edge +; X64-NEXT: movl $67305985, (%rdx,%rcx,8) # 
imm = 0x4030201 +; X64-NEXT: movb %sil, 4(%rdx,%rcx,8) +; X64-NEXT: movw $1798, 5(%rdx,%rcx,8) # imm = 0x706 +; X64-NEXT: movb $8, 7(%rdx,%rcx,8) +; X64-NEXT: incq %rcx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB3_2 +; X64-NEXT: .LBB3_3: # %._crit_edge ; X64-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -301,76 +319,94 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, ptr nocapture %p) nounwind define void @merge_loads_i16(i32 %count, ptr noalias nocapture %q, ptr noalias nocapture %p) nounwind uwtable noinline ssp { ; X86-BWON-LABEL: merge_loads_i16: ; X86-BWON: # %bb.0: -; X86-BWON-NEXT: pushl %esi +; X86-BWON-NEXT: pushl %edi ; X86-BWON-NEXT: .cfi_def_cfa_offset 8 -; X86-BWON-NEXT: .cfi_offset %esi, -8 +; X86-BWON-NEXT: pushl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 12 +; X86-BWON-NEXT: .cfi_offset %esi, -12 +; X86-BWON-NEXT: .cfi_offset %edi, -8 ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWON-NEXT: testl %eax, %eax ; X86-BWON-NEXT: jle .LBB4_3 ; X86-BWON-NEXT: # %bb.1: # %.lr.ph ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BWON-NEXT: xorl %esi, %esi ; X86-BWON-NEXT: .p2align 4 ; X86-BWON-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1 -; X86-BWON-NEXT: movzwl (%edx), %esi -; X86-BWON-NEXT: movw %si, (%ecx) -; X86-BWON-NEXT: addl $8, %ecx -; X86-BWON-NEXT: decl %eax +; X86-BWON-NEXT: movzwl (%edx), %edi +; X86-BWON-NEXT: movw %di, (%ecx,%esi,8) +; X86-BWON-NEXT: incl %esi +; X86-BWON-NEXT: cmpl %esi, %eax ; X86-BWON-NEXT: jne .LBB4_2 ; X86-BWON-NEXT: .LBB4_3: # %._crit_edge ; X86-BWON-NEXT: popl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 8 +; X86-BWON-NEXT: popl %edi ; X86-BWON-NEXT: .cfi_def_cfa_offset 4 ; X86-BWON-NEXT: retl ; ; X86-BWOFF-LABEL: merge_loads_i16: ; X86-BWOFF: # %bb.0: -; X86-BWOFF-NEXT: pushl %esi +; X86-BWOFF-NEXT: pushl %edi ; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8 -; X86-BWOFF-NEXT: .cfi_offset %esi, -8 +; X86-BWOFF-NEXT: pushl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12 +; X86-BWOFF-NEXT: .cfi_offset %esi, -12 +; X86-BWOFF-NEXT: .cfi_offset %edi, -8 ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWOFF-NEXT: testl %eax, %eax ; X86-BWOFF-NEXT: jle .LBB4_3 ; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BWOFF-NEXT: xorl %esi, %esi ; X86-BWOFF-NEXT: .p2align 4 ; X86-BWOFF-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1 -; X86-BWOFF-NEXT: movw (%edx), %si -; X86-BWOFF-NEXT: movw %si, (%ecx) -; X86-BWOFF-NEXT: addl $8, %ecx -; X86-BWOFF-NEXT: decl %eax +; X86-BWOFF-NEXT: movw (%edx), %di +; X86-BWOFF-NEXT: movw %di, (%ecx,%esi,8) +; X86-BWOFF-NEXT: incl %esi +; X86-BWOFF-NEXT: cmpl %esi, %eax ; X86-BWOFF-NEXT: jne .LBB4_2 ; X86-BWOFF-NEXT: .LBB4_3: # %._crit_edge ; X86-BWOFF-NEXT: popl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8 +; X86-BWOFF-NEXT: popl %edi ; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4 ; X86-BWOFF-NEXT: retl ; ; X64-BWON-LABEL: merge_loads_i16: ; X64-BWON: # %bb.0: ; X64-BWON-NEXT: testl %edi, %edi -; X64-BWON-NEXT: jle .LBB4_2 +; X64-BWON-NEXT: jle .LBB4_3 +; X64-BWON-NEXT: # %bb.1: # %.lr.ph +; X64-BWON-NEXT: movl %edi, %eax +; X64-BWON-NEXT: xorl %ecx, %ecx ; X64-BWON-NEXT: .p2align 4 -; X64-BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 -; X64-BWON-NEXT: movzwl (%rsi), %eax -; X64-BWON-NEXT: movw %ax, (%rdx) -; X64-BWON-NEXT: addq $8, %rdx -; X64-BWON-NEXT: decl %edi -; X64-BWON-NEXT: jne .LBB4_1 -; X64-BWON-NEXT: .LBB4_2: # 
%._crit_edge +; X64-BWON-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1 +; X64-BWON-NEXT: movzwl (%rsi), %edi +; X64-BWON-NEXT: movw %di, (%rdx,%rcx,8) +; X64-BWON-NEXT: incq %rcx +; X64-BWON-NEXT: cmpl %ecx, %eax +; X64-BWON-NEXT: jne .LBB4_2 +; X64-BWON-NEXT: .LBB4_3: # %._crit_edge ; X64-BWON-NEXT: retq ; ; X64-BWOFF-LABEL: merge_loads_i16: ; X64-BWOFF: # %bb.0: ; X64-BWOFF-NEXT: testl %edi, %edi -; X64-BWOFF-NEXT: jle .LBB4_2 +; X64-BWOFF-NEXT: jle .LBB4_3 +; X64-BWOFF-NEXT: # %bb.1: # %.lr.ph +; X64-BWOFF-NEXT: movl %edi, %eax +; X64-BWOFF-NEXT: xorl %ecx, %ecx ; X64-BWOFF-NEXT: .p2align 4 -; X64-BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 -; X64-BWOFF-NEXT: movw (%rsi), %ax -; X64-BWOFF-NEXT: movw %ax, (%rdx) -; X64-BWOFF-NEXT: addq $8, %rdx -; X64-BWOFF-NEXT: decl %edi -; X64-BWOFF-NEXT: jne .LBB4_1 -; X64-BWOFF-NEXT: .LBB4_2: # %._crit_edge +; X64-BWOFF-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1 +; X64-BWOFF-NEXT: movw (%rsi), %di +; X64-BWOFF-NEXT: movw %di, (%rdx,%rcx,8) +; X64-BWOFF-NEXT: incq %rcx +; X64-BWOFF-NEXT: cmpl %ecx, %eax +; X64-BWOFF-NEXT: jne .LBB4_2 +; X64-BWOFF-NEXT: .LBB4_3: # %._crit_edge ; X64-BWOFF-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -404,6 +440,9 @@ define void @no_merge_loads(i32 %count, ptr noalias nocapture %q, ptr noalias no ; X86-BWON: # %bb.0: ; X86-BWON-NEXT: pushl %ebx ; X86-BWON-NEXT: .cfi_def_cfa_offset 8 +; X86-BWON-NEXT: pushl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 12 +; X86-BWON-NEXT: .cfi_offset %esi, -12 ; X86-BWON-NEXT: .cfi_offset %ebx, -8 ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWON-NEXT: testl %eax, %eax @@ -411,17 +450,20 @@ define void @no_merge_loads(i32 %count, ptr noalias nocapture %q, ptr noalias no ; X86-BWON-NEXT: # %bb.1: # %.lr.ph ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BWON-NEXT: xorl %esi, %esi ; X86-BWON-NEXT: .p2align 4 ; X86-BWON-NEXT: .LBB5_2: # %a4 ; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-BWON-NEXT: movzbl (%edx), %ebx -; X86-BWON-NEXT: movb %bl, (%ecx) +; X86-BWON-NEXT: movb %bl, (%ecx,%esi,8) ; X86-BWON-NEXT: movzbl 1(%edx), %ebx -; X86-BWON-NEXT: movb %bl, 1(%ecx) -; X86-BWON-NEXT: addl $8, %ecx -; X86-BWON-NEXT: decl %eax +; X86-BWON-NEXT: movb %bl, 1(%ecx,%esi,8) +; X86-BWON-NEXT: incl %esi +; X86-BWON-NEXT: cmpl %esi, %eax ; X86-BWON-NEXT: jne .LBB5_2 ; X86-BWON-NEXT: .LBB5_3: # %._crit_edge +; X86-BWON-NEXT: popl %esi +; X86-BWON-NEXT: .cfi_def_cfa_offset 8 ; X86-BWON-NEXT: popl %ebx ; X86-BWON-NEXT: .cfi_def_cfa_offset 4 ; X86-BWON-NEXT: retl @@ -430,6 +472,9 @@ define void @no_merge_loads(i32 %count, ptr noalias nocapture %q, ptr noalias no ; X86-BWOFF: # %bb.0: ; X86-BWOFF-NEXT: pushl %ebx ; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8 +; X86-BWOFF-NEXT: pushl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12 +; X86-BWOFF-NEXT: .cfi_offset %esi, -12 ; X86-BWOFF-NEXT: .cfi_offset %ebx, -8 ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BWOFF-NEXT: testl %eax, %eax @@ -437,17 +482,20 @@ define void @no_merge_loads(i32 %count, ptr noalias nocapture %q, ptr noalias no ; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BWOFF-NEXT: xorl %esi, %esi ; X86-BWOFF-NEXT: .p2align 4 ; X86-BWOFF-NEXT: .LBB5_2: # %a4 ; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-BWOFF-NEXT: movb (%edx), %bl -; X86-BWOFF-NEXT: movb %bl, (%ecx) +; X86-BWOFF-NEXT: movb %bl, (%ecx,%esi,8) ; 
X86-BWOFF-NEXT: movb 1(%edx), %bl -; X86-BWOFF-NEXT: movb %bl, 1(%ecx) -; X86-BWOFF-NEXT: addl $8, %ecx -; X86-BWOFF-NEXT: decl %eax +; X86-BWOFF-NEXT: movb %bl, 1(%ecx,%esi,8) +; X86-BWOFF-NEXT: incl %esi +; X86-BWOFF-NEXT: cmpl %esi, %eax ; X86-BWOFF-NEXT: jne .LBB5_2 ; X86-BWOFF-NEXT: .LBB5_3: # %._crit_edge +; X86-BWOFF-NEXT: popl %esi +; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8 ; X86-BWOFF-NEXT: popl %ebx ; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4 ; X86-BWOFF-NEXT: retl @@ -455,35 +503,41 @@ define void @no_merge_loads(i32 %count, ptr noalias nocapture %q, ptr noalias no ; X64-BWON-LABEL: no_merge_loads: ; X64-BWON: # %bb.0: ; X64-BWON-NEXT: testl %edi, %edi -; X64-BWON-NEXT: jle .LBB5_2 +; X64-BWON-NEXT: jle .LBB5_3 +; X64-BWON-NEXT: # %bb.1: # %.lr.ph +; X64-BWON-NEXT: movl %edi, %eax +; X64-BWON-NEXT: xorl %ecx, %ecx ; X64-BWON-NEXT: .p2align 4 -; X64-BWON-NEXT: .LBB5_1: # %a4 +; X64-BWON-NEXT: .LBB5_2: # %a4 ; X64-BWON-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-BWON-NEXT: movzbl (%rsi), %eax -; X64-BWON-NEXT: movb %al, (%rdx) -; X64-BWON-NEXT: movzbl 1(%rsi), %eax -; X64-BWON-NEXT: movb %al, 1(%rdx) -; X64-BWON-NEXT: addq $8, %rdx -; X64-BWON-NEXT: decl %edi -; X64-BWON-NEXT: jne .LBB5_1 -; X64-BWON-NEXT: .LBB5_2: # %._crit_edge +; X64-BWON-NEXT: movzbl (%rsi), %edi +; X64-BWON-NEXT: movb %dil, (%rdx,%rcx,8) +; X64-BWON-NEXT: movzbl 1(%rsi), %edi +; X64-BWON-NEXT: movb %dil, 1(%rdx,%rcx,8) +; X64-BWON-NEXT: incq %rcx +; X64-BWON-NEXT: cmpl %ecx, %eax +; X64-BWON-NEXT: jne .LBB5_2 +; X64-BWON-NEXT: .LBB5_3: # %._crit_edge ; X64-BWON-NEXT: retq ; ; X64-BWOFF-LABEL: no_merge_loads: ; X64-BWOFF: # %bb.0: ; X64-BWOFF-NEXT: testl %edi, %edi -; X64-BWOFF-NEXT: jle .LBB5_2 +; X64-BWOFF-NEXT: jle .LBB5_3 +; X64-BWOFF-NEXT: # %bb.1: # %.lr.ph +; X64-BWOFF-NEXT: movl %edi, %eax +; X64-BWOFF-NEXT: xorl %ecx, %ecx ; X64-BWOFF-NEXT: .p2align 4 -; X64-BWOFF-NEXT: .LBB5_1: # %a4 +; X64-BWOFF-NEXT: .LBB5_2: # %a4 ; X64-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-BWOFF-NEXT: movb (%rsi), %al -; X64-BWOFF-NEXT: movb %al, (%rdx) -; X64-BWOFF-NEXT: movb 1(%rsi), %al -; X64-BWOFF-NEXT: movb %al, 1(%rdx) -; X64-BWOFF-NEXT: addq $8, %rdx -; X64-BWOFF-NEXT: decl %edi -; X64-BWOFF-NEXT: jne .LBB5_1 -; X64-BWOFF-NEXT: .LBB5_2: # %._crit_edge +; X64-BWOFF-NEXT: movb (%rsi), %dil +; X64-BWOFF-NEXT: movb %dil, (%rdx,%rcx,8) +; X64-BWOFF-NEXT: movb 1(%rsi), %dil +; X64-BWOFF-NEXT: movb %dil, 1(%rdx,%rcx,8) +; X64-BWOFF-NEXT: incq %rcx +; X64-BWOFF-NEXT: cmpl %ecx, %eax +; X64-BWOFF-NEXT: jne .LBB5_2 +; X64-BWOFF-NEXT: .LBB5_3: # %._crit_edge ; X64-BWOFF-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge diff --git a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll index 12d66f64cb73d..74d787af55dc7 100644 --- a/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll +++ b/llvm/test/CodeGen/X86/PR71178-register-coalescer-crash.ll @@ -40,8 +40,8 @@ define i32 @h(i1 %arg, i32 %arg1) { ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.5: # %bb14 ; CHECK-NEXT: movl %eax, %r8d -; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jne .LBB0_6 ; CHECK-NEXT: .LBB0_10: # %bb22 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 6e22d855dc831..1f774e0829e93 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -643,23 +643,24 @@ define i128 @abd_minmax_i128(i128 %a, i128 
%b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovll %edx, %ecx +; X86-NEXT: cmovll %ebp, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx @@ -852,32 +853,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovgel (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: cmovgel %ebp, %ecx +; X86-NEXT: cmovgel %ebp, %edx ; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1057,8 +1058,8 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx @@ -1088,8 +1089,8 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx @@ -1122,31 +1123,31 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %ebp, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1179,31 +1180,31 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %ebp, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 0356c2702a419..60610789df206 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -347,32 +347,32 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %ebp, %edx ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -408,32 +408,32 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %ebp, %edx ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -589,32 +589,32 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %ebp, %edx ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -772,32 +772,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %ebp, %edx ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1030,29 +1030,29 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi ; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx ; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) 
-; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1082,29 +1082,29 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi ; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx ; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1286,32 +1286,32 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %ebp, %edx ; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 6bda99c89a37e..9694efbc22c27 100644 --- 
a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -358,36 +358,36 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -426,36 +426,36 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl 
%ebx, %eax +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -625,23 +625,24 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovbl %edx, %ecx +; X86-NEXT: cmovbl %ebp, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx @@ -830,36 +831,36 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ebp, 
8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll index 78b315a3773ec..52c513ff526ed 100644 --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -237,7 +237,7 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_ext_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -308,7 +308,7 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_ext_v2i64_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -467,7 +467,7 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_minmax_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -627,7 +627,7 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_cmp_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -705,7 +705,7 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 27acec32fd348..051e0d7757cb1 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -267,9 +267,9 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_ext_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: xorl %ecx, %edx @@ -297,9 +297,9 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_ext_i64_undef: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: xorl %ecx, %edx @@ -329,29 
+329,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -384,29 +384,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -521,9 +521,9 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_minmax_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: xorl %ecx, %edx @@ -551,29 +551,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -689,9 +689,9 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_cmp_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: xorl %ecx, %edx @@ -720,29 +720,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 
4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -859,9 +859,9 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { ; X86-LABEL: abd_select_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: xorl %ecx, %edx @@ -890,29 +890,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abi-isel.ll b/llvm/test/CodeGen/X86/abi-isel.ll index 2ac392c729d19..d3796ad4e949f 100644 --- a/llvm/test/CodeGen/X86/abi-isel.ll +++ b/llvm/test/CodeGen/X86/abi-isel.ll @@ -2791,7 +2791,8 @@ define dso_local void @ind04(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp31: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp31-.L31$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Lddst$local@GOTOFF(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $.Lddst$local@GOTOFF, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, .Ldptr$local@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; @@ -2822,7 +2823,8 @@ define dso_local void @ind04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L31$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ddst-L31$pb(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $_ddst-L31$pb, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L31$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; @@ -3062,7 +3064,8 @@ define dso_local void @ind07(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp34: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp34-.L34$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), 
%ecx -; LINUX-32-PIC-NEXT: leal ldst@GOTOFF(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $ldst@GOTOFF, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, lptr@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; @@ -3093,7 +3096,8 @@ define dso_local void @ind07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L34$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ldst-L34$pb(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $_ldst-L34$pb, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L34$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; @@ -3423,7 +3427,8 @@ define dso_local void @off01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: off01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-STATIC-NEXT: movq %rax, (%rcx) ; LINUX-64-STATIC-NEXT: retq @@ -3444,7 +3449,8 @@ define dso_local void @off01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp38-.L38$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl dst@GOT(%eax), %edx -; LINUX-32-PIC-NEXT: leal 64(%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $64, %ecx ; LINUX-32-PIC-NEXT: movl ptr@GOT(%eax), %eax ; LINUX-32-PIC-NEXT: movl %ecx, (%eax) ; LINUX-32-PIC-NEXT: retl @@ -3452,7 +3458,8 @@ define dso_local void @off01(i64 %i) nounwind { ; LINUX-64-PIC-LABEL: off01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-PIC-NEXT: movq %rax, (%rcx) ; LINUX-64-PIC-NEXT: retq @@ -3468,7 +3475,8 @@ define dso_local void @off01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_dst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_ptr$non_lazy_ptr, %ecx ; DARWIN-32-DYNAMIC-NEXT: movl %eax, (%ecx) ; DARWIN-32-DYNAMIC-NEXT: retl @@ -3480,7 +3488,8 @@ define dso_local void @off01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_dst$non_lazy_ptr-L38$pb(%eax), %edx -; DARWIN-32-PIC-NEXT: leal 64(%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $64, %ecx ; DARWIN-32-PIC-NEXT: movl L_ptr$non_lazy_ptr-L38$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %ecx, (%eax) ; DARWIN-32-PIC-NEXT: retl @@ -3488,7 +3497,8 @@ define dso_local void @off01(i64 %i) nounwind { ; DARWIN-64-STATIC-LABEL: off01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-STATIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-STATIC-NEXT: retq @@ -3496,7 +3506,8 @@ define dso_local void @off01(i64 %i) nounwind 
{ ; DARWIN-64-DYNAMIC-LABEL: off01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-DYNAMIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-DYNAMIC-NEXT: retq @@ -3504,7 +3515,8 @@ define dso_local void @off01(i64 %i) nounwind { ; DARWIN-64-PIC-LABEL: off01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-PIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-PIC-NEXT: retq @@ -3520,7 +3532,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: oxf01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-STATIC-NEXT: movq %rax, (%rcx) ; LINUX-64-STATIC-NEXT: retq @@ -3541,7 +3554,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp39-.L39$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl xdst@GOT(%eax), %edx -; LINUX-32-PIC-NEXT: leal 64(%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $64, %ecx ; LINUX-32-PIC-NEXT: movl ptr@GOT(%eax), %eax ; LINUX-32-PIC-NEXT: movl %ecx, (%eax) ; LINUX-32-PIC-NEXT: retl @@ -3549,7 +3563,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; LINUX-64-PIC-LABEL: oxf01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-PIC-NEXT: movq %rax, (%rcx) ; LINUX-64-PIC-NEXT: retq @@ -3565,7 +3580,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_xdst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_ptr$non_lazy_ptr, %ecx ; DARWIN-32-DYNAMIC-NEXT: movl %eax, (%ecx) ; DARWIN-32-DYNAMIC-NEXT: retl @@ -3577,7 +3593,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_xdst$non_lazy_ptr-L39$pb(%eax), %edx -; DARWIN-32-PIC-NEXT: leal 64(%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $64, %ecx ; DARWIN-32-PIC-NEXT: movl L_ptr$non_lazy_ptr-L39$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %ecx, (%eax) ; DARWIN-32-PIC-NEXT: retl @@ -3585,7 +3602,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; DARWIN-64-STATIC-LABEL: oxf01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: 
addq $64, %rax ; DARWIN-64-STATIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-STATIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-STATIC-NEXT: retq @@ -3593,7 +3611,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; DARWIN-64-DYNAMIC-LABEL: oxf01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-DYNAMIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-DYNAMIC-NEXT: retq @@ -3601,7 +3620,8 @@ define dso_local void @oxf01(i64 %i) nounwind { ; DARWIN-64-PIC-LABEL: oxf01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-PIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-PIC-NEXT: retq @@ -3946,14 +3966,16 @@ define dso_local void @off04(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp43: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp43-.L43$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Lddst$local@GOTOFF+64(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $.Lddst$local@GOTOFF+64, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, .Ldptr$local@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: off04: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Lddst$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: movq %rax, .Ldptr$local(%rip) ; LINUX-64-PIC-NEXT: retq ; @@ -3977,28 +3999,32 @@ define dso_local void @off04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L43$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ddst-L43$pb+64(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $_ddst-L43$pb+64, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L43$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: off04: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: off04: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: off04: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-PIC-NEXT: retq @@ -4220,14 +4246,16 @@ define dso_local void @off07(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp46: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp46-.L46$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx 
-; LINUX-32-PIC-NEXT: leal ldst@GOTOFF+64(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $ldst@GOTOFF+64, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, lptr@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: off07: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq ldst(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: movq %rax, lptr(%rip) ; LINUX-64-PIC-NEXT: retq ; @@ -4251,28 +4279,32 @@ define dso_local void @off07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L46$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ldst-L46$pb+64(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $_ldst-L46$pb+64, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L46$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: off07: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: off07: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: off07: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-PIC-NEXT: retq @@ -5231,7 +5263,8 @@ define dso_local void @big01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: big01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-STATIC-NEXT: movq %rax, (%rcx) ; LINUX-64-STATIC-NEXT: retq @@ -5252,7 +5285,8 @@ define dso_local void @big01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp58-.L58$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl dst@GOT(%eax), %edx -; LINUX-32-PIC-NEXT: leal 262144(%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $262144, %ecx # imm = 0x40000 ; LINUX-32-PIC-NEXT: movl ptr@GOT(%eax), %eax ; LINUX-32-PIC-NEXT: movl %ecx, (%eax) ; LINUX-32-PIC-NEXT: retl @@ -5260,7 +5294,8 @@ define dso_local void @big01(i64 %i) nounwind { ; LINUX-64-PIC-LABEL: big01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: movq ptr@GOTPCREL(%rip), %rcx ; LINUX-64-PIC-NEXT: movq %rax, (%rcx) ; LINUX-64-PIC-NEXT: retq @@ -5276,7 +5311,8 @@ define dso_local void @big01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; 
DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_dst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: movl L_ptr$non_lazy_ptr, %ecx ; DARWIN-32-DYNAMIC-NEXT: movl %eax, (%ecx) ; DARWIN-32-DYNAMIC-NEXT: retl @@ -5288,7 +5324,8 @@ define dso_local void @big01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_dst$non_lazy_ptr-L58$pb(%eax), %edx -; DARWIN-32-PIC-NEXT: leal 262144(%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%edx,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $262144, %ecx ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: movl L_ptr$non_lazy_ptr-L58$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %ecx, (%eax) ; DARWIN-32-PIC-NEXT: retl @@ -5296,7 +5333,8 @@ define dso_local void @big01(i64 %i) nounwind { ; DARWIN-64-STATIC-LABEL: big01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-STATIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-STATIC-NEXT: retq @@ -5304,7 +5342,8 @@ define dso_local void @big01(i64 %i) nounwind { ; DARWIN-64-DYNAMIC-LABEL: big01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-DYNAMIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-DYNAMIC-NEXT: retq @@ -5312,7 +5351,8 @@ define dso_local void @big01(i64 %i) nounwind { ; DARWIN-64-PIC-LABEL: big01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: movq _ptr@GOTPCREL(%rip), %rcx ; DARWIN-64-PIC-NEXT: movq %rax, (%rcx) ; DARWIN-64-PIC-NEXT: retq @@ -5547,14 +5587,16 @@ define dso_local void @big04(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp61: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp61-.L61$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Lddst$local@GOTOFF+262144(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $.Lddst$local@GOTOFF+262144, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, .Ldptr$local@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: big04: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Lddst$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: movq %rax, .Ldptr$local(%rip) ; LINUX-64-PIC-NEXT: retq ; @@ -5578,28 +5620,32 @@ define dso_local void @big04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L61$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ddst-L61$pb+262144(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; 
DARWIN-32-PIC-NEXT: addl $_ddst-L61$pb+262144, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L61$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: big04: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: big04: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: big04: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: movq %rax, _dptr(%rip) ; DARWIN-64-PIC-NEXT: retq @@ -5821,14 +5867,16 @@ define dso_local void @big07(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp64: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp64-.L64$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal ldst@GOTOFF+262144(%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; LINUX-32-PIC-NEXT: addl $ldst@GOTOFF+262144, %ecx ; LINUX-32-PIC-NEXT: movl %ecx, lptr@GOTOFF(%eax) ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: big07: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq ldst(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: movq %rax, lptr(%rip) ; LINUX-64-PIC-NEXT: retq ; @@ -5852,28 +5900,32 @@ define dso_local void @big07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L64$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ldst-L64$pb+262144(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: addl $_ldst-L64$pb+262144, %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L64$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: big07: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: big07: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: big07: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: movq %rax, _lptr(%rip) ; DARWIN-64-PIC-NEXT: 
retq @@ -8850,7 +8902,8 @@ define dso_local ptr @cat00(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cat00: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq src@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cat00: @@ -8868,13 +8921,15 @@ define dso_local ptr @cat00(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp109-.L109$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl src@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat00: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq src@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat00: @@ -8887,7 +8942,8 @@ define dso_local ptr @cat00(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_src$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cat00: @@ -8897,25 +8953,29 @@ define dso_local ptr @cat00(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_src$non_lazy_ptr-L109$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat00: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat00: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat00: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -8928,7 +8988,8 @@ define dso_local ptr @cxt00(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cxt00: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq xsrc@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cxt00: @@ -8946,13 +9007,15 @@ define dso_local ptr @cxt00(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp110-.L110$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl xsrc@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal 
(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cxt00: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq xsrc@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cxt00: @@ -8965,7 +9028,8 @@ define dso_local ptr @cxt00(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_xsrc$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cxt00: @@ -8975,25 +9039,29 @@ define dso_local ptr @cxt00(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_xsrc$non_lazy_ptr-L110$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cxt00: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cxt00: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cxt00: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9006,7 +9074,8 @@ define dso_local ptr @cat01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cat01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cat01: @@ -9024,13 +9093,15 @@ define dso_local ptr @cat01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp111-.L111$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl dst@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat01: @@ -9043,7 +9114,8 @@ define dso_local ptr @cat01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_dst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), 
%eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cat01: @@ -9053,25 +9125,29 @@ define dso_local ptr @cat01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_dst$non_lazy_ptr-L111$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9084,7 +9160,8 @@ define dso_local ptr @cxt01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cxt01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cxt01: @@ -9102,13 +9179,15 @@ define dso_local ptr @cxt01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp112-.L112$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl xdst@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cxt01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cxt01: @@ -9121,7 +9200,8 @@ define dso_local ptr @cxt01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_xdst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cxt01: @@ -9131,25 +9211,29 @@ define dso_local ptr @cxt01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_xdst$non_lazy_ptr-L112$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cxt01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq 
(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cxt01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cxt01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9163,14 +9247,16 @@ define dso_local ptr @cat02(i64 %i) nounwind { ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq ptr@GOTPCREL(%rip), %rax ; LINUX-64-STATIC-NEXT: movq (%rax), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cat02: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl ptr, %ecx -; LINUX-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $64, %eax ; LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cat02: @@ -9183,21 +9269,24 @@ define dso_local ptr @cat02(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: movl ptr@GOT(%eax), %eax ; LINUX-32-PIC-NEXT: movl (%eax), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat02: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq ptr@GOTPCREL(%rip), %rax ; LINUX-64-PIC-NEXT: movq (%rax), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat02: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _ptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $64, %eax ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cat02: @@ -9205,7 +9294,8 @@ define dso_local ptr @cat02(i64 %i) nounwind { ; DARWIN-32-DYNAMIC-NEXT: movl L_ptr$non_lazy_ptr, %eax ; DARWIN-32-DYNAMIC-NEXT: movl (%eax), %eax ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cat02: @@ -9216,28 +9306,32 @@ define dso_local ptr @cat02(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: movl L_ptr$non_lazy_ptr-L113$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl (%eax), %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat02: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-STATIC-NEXT: movq (%rax), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; 
DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat02: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-DYNAMIC-NEXT: movq (%rax), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat02: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-PIC-NEXT: movq (%rax), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9267,13 +9361,15 @@ define dso_local ptr @cat03(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp114: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp114-.L114$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Ldsrc$local@GOTOFF+64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $.Ldsrc$local@GOTOFF+64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat03: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Ldsrc$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat03: @@ -9294,25 +9390,29 @@ define dso_local ptr @cat03(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L114$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _dsrc-L114$pb+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_dsrc-L114$pb+64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat03: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat03: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat03: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9341,13 +9441,15 @@ define dso_local ptr @cat04(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp115: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp115-.L115$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Lddst$local@GOTOFF+64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $.Lddst$local@GOTOFF+64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat04: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Lddst$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat04: @@ -9368,25 
+9470,29 @@ define dso_local ptr @cat04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L115$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ddst-L115$pb+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_ddst-L115$pb+64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat04: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat04: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat04: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9399,14 +9505,16 @@ define dso_local ptr @cat05(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cat05: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dptr(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cat05: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl dptr, %ecx -; LINUX-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $64, %eax ; LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cat05: @@ -9418,27 +9526,31 @@ define dso_local ptr @cat05(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp116-.L116$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl .Ldptr$local@GOTOFF(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat05: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq .Ldptr$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat05: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _dptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $64, %eax ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cat05: ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl _dptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, %eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cat05: @@ -9448,25 +9560,29 @@ define dso_local ptr @cat05(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; 
DARWIN-32-PIC-NEXT: movl _dptr-L116$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat05: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat05: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat05: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9496,13 +9612,15 @@ define dso_local ptr @cat06(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp117: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp117-.L117$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal lsrc@GOTOFF+64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $lsrc@GOTOFF+64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat06: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq lsrc(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat06: @@ -9523,25 +9641,29 @@ define dso_local ptr @cat06(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L117$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _lsrc-L117$pb+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_lsrc-L117$pb+64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat06: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat06: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat06: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9570,13 +9692,15 @@ define dso_local ptr @cat07(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp118: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp118-.L118$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal ldst@GOTOFF+64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $ldst@GOTOFF+64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat07: ; LINUX-64-PIC: # %bb.0: # %entry ; 
LINUX-64-PIC-NEXT: leaq ldst(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat07: @@ -9597,25 +9721,29 @@ define dso_local ptr @cat07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L118$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ldst-L118$pb+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_ldst-L118$pb+64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat07: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat07: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat07: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9628,14 +9756,16 @@ define dso_local ptr @cat08(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cat08: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq lptr(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $64, %rax ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cat08: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl lptr, %ecx -; LINUX-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $64, %eax ; LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cat08: @@ -9647,27 +9777,31 @@ define dso_local ptr @cat08(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp119-.L119$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl lptr@GOTOFF(%eax), %eax -; LINUX-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $64, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cat08: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq lptr(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $64, %rax ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cat08: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _lptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $64, %eax ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cat08: ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl _lptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 64(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $64, 
%eax ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cat08: @@ -9677,25 +9811,29 @@ define dso_local ptr @cat08(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl _lptr-L119$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $64, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat08: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $64, %rax ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cat08: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $64, %rax ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cat08: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 64(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $64, %rax ; DARWIN-64-PIC-NEXT: retq entry: @@ -9709,7 +9847,8 @@ define dso_local ptr @cam00(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cam00: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq src@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cam00: @@ -9727,13 +9866,15 @@ define dso_local ptr @cam00(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp120-.L120$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl src@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam00: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq src@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam00: @@ -9746,7 +9887,8 @@ define dso_local ptr @cam00(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_src$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cam00: @@ -9756,25 +9898,29 @@ define dso_local ptr @cam00(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_src$non_lazy_ptr-L120$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam00: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax 
+; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam00: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam00: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _src@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -9787,7 +9933,8 @@ define dso_local ptr @cxm00(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cxm00: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq xsrc@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cxm00: @@ -9805,13 +9952,15 @@ define dso_local ptr @cxm00(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp121-.L121$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl xsrc@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cxm00: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq xsrc@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cxm00: @@ -9824,7 +9973,8 @@ define dso_local ptr @cxm00(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_xsrc$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cxm00: @@ -9834,25 +9984,29 @@ define dso_local ptr @cxm00(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_xsrc$non_lazy_ptr-L121$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cxm00: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cxm00: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cxm00: ; 
DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _xsrc@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -9865,7 +10019,8 @@ define dso_local ptr @cam01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cam01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cam01: @@ -9883,13 +10038,15 @@ define dso_local ptr @cam01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp122-.L122$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl dst@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq dst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam01: @@ -9902,7 +10059,8 @@ define dso_local ptr @cam01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_dst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cam01: @@ -9912,25 +10070,29 @@ define dso_local ptr @cam01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_dst$non_lazy_ptr-L122$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -9943,7 +10105,8 @@ define dso_local ptr @cxm01(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cxm01: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 
262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cxm01: @@ -9961,13 +10124,15 @@ define dso_local ptr @cxm01(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp123-.L123$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl xdst@GOT(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cxm01: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq xdst@GOTPCREL(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cxm01: @@ -9980,7 +10145,8 @@ define dso_local ptr @cxm01(i64 %i) nounwind { ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl L_xdst$non_lazy_ptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cxm01: @@ -9990,25 +10156,29 @@ define dso_local ptr @cxm01(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl L_xdst$non_lazy_ptr-L123$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cxm01: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cxm01: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cxm01: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _xdst@GOTPCREL(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10022,14 +10192,16 @@ define dso_local ptr @cam02(i64 %i) nounwind { ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq ptr@GOTPCREL(%rip), %rax ; LINUX-64-STATIC-NEXT: movq (%rax), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cam02: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl ptr, %ecx -; LINUX-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $262144, %eax # imm = 0x40000 ; 
LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cam02: @@ -10042,21 +10214,24 @@ define dso_local ptr @cam02(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: movl ptr@GOT(%eax), %eax ; LINUX-32-PIC-NEXT: movl (%eax), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam02: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq ptr@GOTPCREL(%rip), %rax ; LINUX-64-PIC-NEXT: movq (%rax), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam02: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _ptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cam02: @@ -10064,7 +10239,8 @@ define dso_local ptr @cam02(i64 %i) nounwind { ; DARWIN-32-DYNAMIC-NEXT: movl L_ptr$non_lazy_ptr, %eax ; DARWIN-32-DYNAMIC-NEXT: movl (%eax), %eax ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cam02: @@ -10075,28 +10251,32 @@ define dso_local ptr @cam02(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: movl L_ptr$non_lazy_ptr-L124$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl (%eax), %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam02: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-STATIC-NEXT: movq (%rax), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam02: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-DYNAMIC-NEXT: movq (%rax), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam02: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _ptr@GOTPCREL(%rip), %rax ; DARWIN-64-PIC-NEXT: movq (%rax), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10126,13 +10306,15 @@ define dso_local ptr @cam03(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp125: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp125-.L125$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Ldsrc$local@GOTOFF+262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: 
addl $.Ldsrc$local@GOTOFF+262144, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam03: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Ldsrc$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam03: @@ -10153,25 +10335,29 @@ define dso_local ptr @cam03(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L125$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _dsrc-L125$pb+262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_dsrc-L125$pb+262144, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam03: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam03: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam03: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _dsrc(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10200,13 +10386,15 @@ define dso_local ptr @cam04(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp126: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp126-.L126$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal .Lddst$local@GOTOFF+262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $.Lddst$local@GOTOFF+262144, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam04: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq .Lddst$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam04: @@ -10227,25 +10415,29 @@ define dso_local ptr @cam04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L126$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ddst-L126$pb+262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_ddst-L126$pb+262144, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam04: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam04: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; 
; DARWIN-64-PIC-LABEL: cam04: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ddst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10258,14 +10450,16 @@ define dso_local ptr @cam05(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cam05: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq dptr(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cam05: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl dptr, %ecx -; LINUX-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cam05: @@ -10277,27 +10471,31 @@ define dso_local ptr @cam05(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp127-.L127$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl .Ldptr$local@GOTOFF(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam05: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq .Ldptr$local(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam05: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _dptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cam05: ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl _dptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cam05: @@ -10307,25 +10505,29 @@ define dso_local ptr @cam05(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl _dptr-L127$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam05: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam05: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; 
DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam05: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _dptr(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10355,13 +10557,15 @@ define dso_local ptr @cam06(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp128: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp128-.L128$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal lsrc@GOTOFF+262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $lsrc@GOTOFF+262144, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam06: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq lsrc(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam06: @@ -10382,25 +10586,29 @@ define dso_local ptr @cam06(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L128$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _lsrc-L128$pb+262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $_lsrc-L128$pb+262144, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam06: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam06: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam06: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _lsrc(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10429,13 +10637,15 @@ define dso_local ptr @cam07(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: .Ltmp129: ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp129-.L129$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; LINUX-32-PIC-NEXT: leal ldst@GOTOFF+262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $ldst@GOTOFF+262144, %eax ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam07: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: leaq ldst(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam07: @@ -10456,25 +10666,29 @@ define dso_local ptr @cam07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L129$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal _ldst-L129$pb+262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; 
DARWIN-32-PIC-NEXT: addl $_ldst-L129$pb+262144, %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam07: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam07: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam07: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: leaq _ldst(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: @@ -10487,14 +10701,16 @@ define dso_local ptr @cam08(i64 %i) nounwind { ; LINUX-64-STATIC-LABEL: cam08: ; LINUX-64-STATIC: # %bb.0: # %entry ; LINUX-64-STATIC-NEXT: movq lptr(%rip), %rax -; LINUX-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-STATIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-STATIC-NEXT: retq ; ; LINUX-32-STATIC-LABEL: cam08: ; LINUX-32-STATIC: # %bb.0: # %entry ; LINUX-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; LINUX-32-STATIC-NEXT: movl lptr, %ecx -; LINUX-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; LINUX-32-STATIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-STATIC-NEXT: retl ; ; LINUX-32-PIC-LABEL: cam08: @@ -10506,27 +10722,31 @@ define dso_local ptr @cam08(i64 %i) nounwind { ; LINUX-32-PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp130-.L130$pb), %eax ; LINUX-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; LINUX-32-PIC-NEXT: movl lptr@GOTOFF(%eax), %eax -; LINUX-32-PIC-NEXT: leal 262144(%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; LINUX-32-PIC-NEXT: addl $262144, %eax # imm = 0x40000 ; LINUX-32-PIC-NEXT: retl ; ; LINUX-64-PIC-LABEL: cam08: ; LINUX-64-PIC: # %bb.0: # %entry ; LINUX-64-PIC-NEXT: movq lptr(%rip), %rax -; LINUX-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; LINUX-64-PIC-NEXT: addq $262144, %rax # imm = 0x40000 ; LINUX-64-PIC-NEXT: retq ; ; DARWIN-32-STATIC-LABEL: cam08: ; DARWIN-32-STATIC: ## %bb.0: ## %entry ; DARWIN-32-STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-STATIC-NEXT: movl _lptr, %ecx -; DARWIN-32-STATIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-STATIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-STATIC-NEXT: retl ; ; DARWIN-32-DYNAMIC-LABEL: cam08: ; DARWIN-32-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-32-DYNAMIC-NEXT: movl {{[0-9]+}}(%esp), %eax ; DARWIN-32-DYNAMIC-NEXT: movl _lptr, %ecx -; DARWIN-32-DYNAMIC-NEXT: leal 262144(%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: leal (%ecx,%eax,4), %eax +; DARWIN-32-DYNAMIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-DYNAMIC-NEXT: retl ; ; DARWIN-32-PIC-LABEL: cam08: @@ -10536,25 +10756,29 @@ define dso_local ptr @cam08(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx ; DARWIN-32-PIC-NEXT: movl _lptr-L130$pb(%eax), %eax -; DARWIN-32-PIC-NEXT: leal 
262144(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal (%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: addl $262144, %eax ## imm = 0x40000 ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cam08: ; DARWIN-64-STATIC: ## %bb.0: ## %entry ; DARWIN-64-STATIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-STATIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-STATIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-STATIC-NEXT: retq ; ; DARWIN-64-DYNAMIC-LABEL: cam08: ; DARWIN-64-DYNAMIC: ## %bb.0: ## %entry ; DARWIN-64-DYNAMIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-DYNAMIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-DYNAMIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-DYNAMIC-NEXT: retq ; ; DARWIN-64-PIC-LABEL: cam08: ; DARWIN-64-PIC: ## %bb.0: ## %entry ; DARWIN-64-PIC-NEXT: movq _lptr(%rip), %rax -; DARWIN-64-PIC-NEXT: leaq 262144(%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: leaq (%rax,%rdi,4), %rax +; DARWIN-64-PIC-NEXT: addq $262144, %rax ## imm = 0x40000 ; DARWIN-64-PIC-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index bae140abdf6b1..5b1b1877a9ba6 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -147,7 +147,6 @@ define i128 @test_i128(i128 %a) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx @@ -161,6 +160,7 @@ define i128 @test_i128(i128 %a) nounwind { ; X86-NEXT: subl %edx, %ebx ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll index ae36f293a60ed..8a891402b1b97 100644 --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -98,7 +98,8 @@ define i32 @select_40_43_i32(i32 %offset, i64 %x) { ; CHECK-NEXT: cmpq $42, %rsi ; CHECK-NEXT: setl %al ; CHECK-NEXT: leal (%rax,%rax,2), %eax -; CHECK-NEXT: leal 40(%rdi,%rax), %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: addl $40, %eax ; CHECK-NEXT: retq %b = icmp sgt i64 %x, 41 %s = select i1 %b, i32 40, i32 43 @@ -244,8 +245,10 @@ define ptr @bullet(i1 %b, ptr readnone %ptr, i64 %idx) { ; CHECK: # %bb.0: ; CHECK-NEXT: leaq (%rdx,%rdx,4), %rax ; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: leaq 60(%rsi,%rax), %rcx -; CHECK-NEXT: leaq 66(%rsi,%rax), %rax +; CHECK-NEXT: leaq (%rsi,%rax), %rcx +; CHECK-NEXT: addq $60, %rcx +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq $66, %rax ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/add-ext.ll b/llvm/test/CodeGen/X86/add-ext.ll index 12df378af3de3..56b757c9665fd 100644 --- a/llvm/test/CodeGen/X86/add-ext.ll +++ b/llvm/test/CodeGen/X86/add-ext.ll @@ -26,7 +26,8 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) { ; CHECK-LABEL: add_nsw_sext_add: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq 5(%rax,%rsi), %rax +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq $5, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, 5 @@ -42,7 +43,8 @@ define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) { ; CHECK-LABEL: add_nsw_sext_lsh_add: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax +; 
CHECK-NEXT: leaq (%rsi,%rax,8), %rax +; CHECK-NEXT: addq $-40, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, -5 @@ -73,7 +75,8 @@ define ptr @gep8(i32 %i, ptr %x) { ; CHECK-LABEL: gep8: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq 5(%rax,%rsi), %rax +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq $5, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, 5 @@ -86,7 +89,8 @@ define ptr @gep16(i32 %i, ptr %x) { ; CHECK-LABEL: gep16: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq -10(%rsi,%rax,2), %rax +; CHECK-NEXT: leaq (%rsi,%rax,2), %rax +; CHECK-NEXT: addq $-10, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, -5 @@ -99,7 +103,8 @@ define ptr @gep32(i32 %i, ptr %x) { ; CHECK-LABEL: gep32: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq 20(%rsi,%rax,4), %rax +; CHECK-NEXT: leaq (%rsi,%rax,4), %rax +; CHECK-NEXT: addq $20, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, 5 @@ -112,7 +117,8 @@ define ptr @gep64(i32 %i, ptr %x) { ; CHECK-LABEL: gep64: ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax -; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax +; CHECK-NEXT: leaq (%rsi,%rax,8), %rax +; CHECK-NEXT: addq $-40, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, -5 @@ -128,7 +134,8 @@ define ptr @gep128(i32 %i, ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: movslq %edi, %rax ; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: leaq 80(%rax,%rsi), %rax +; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq $80, %rax ; CHECK-NEXT: retq %add = add nsw i32 %i, 5 diff --git a/llvm/test/CodeGen/X86/add-of-carry.ll b/llvm/test/CodeGen/X86/add-of-carry.ll index 9bb50de25b2d0..270f59f1e566f 100644 --- a/llvm/test/CodeGen/X86/add-of-carry.ll +++ b/llvm/test/CodeGen/X86/add-of-carry.ll @@ -26,8 +26,8 @@ define i32 @test2(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: sbbl $0, %eax ; CHECK-NEXT: retl %cmp = icmp ugt i32 %x, %y diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll index c2bfcf57185e3..cdd5dd7344dba 100644 --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -59,9 +59,9 @@ define i32 @test_i32_add_add_commute_idx(i32 %x, i32 %y, i32 %z) nounwind { define i32 @test_i32_add_add_idx0(i32 %x, i32 %y, i32 %z) nounwind { ; X86-LABEL: test_i32_add_add_idx0: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl @@ -106,24 +106,24 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: btl $5, {{[0-9]+}}(%esp) -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl 
$0, %edi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -148,9 +148,9 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { define i32 @test_i32_add_sub_idx(i32 %x, i32 %y, i32 %z) nounwind { ; X86-LABEL: test_i32_add_sub_idx: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $31, %eax ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl @@ -391,22 +391,22 @@ define i64 @test_i64_add_add_var(i64 %x, i64 %y, i64 %z, i64 %w) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: shrdl %cl, %ebx, %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: jne .LBB15_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: .LBB15_2: -; X86-NEXT: andl $1, %edi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: andl $1, %esi +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl $0, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/add.ll b/llvm/test/CodeGen/X86/add.ll index 079294ef09bdb..44d956403f392 100644 --- a/llvm/test/CodeGen/X86/add.ll +++ b/llvm/test/CodeGen/X86/add.ll @@ -276,8 +276,8 @@ define {i64, i1} @test8(i64 %left, i64 %right) nounwind { ; X86-LABEL: test8: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: setb %cl ; X86-NEXT: retl @@ -310,9 +310,9 @@ entry: define i32 @test9(i32 %x, i32 %y) nounwind readnone { ; X86-LABEL: test9: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: cmpl $10, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sete %cl ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl @@ -462,19 +462,19 @@ define <4 x i32> @inc_not_vec(<4 x i32> %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %esi, %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edi, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; 
X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
diff --git a/llvm/test/CodeGen/X86/add_shl_constant.ll b/llvm/test/CodeGen/X86/add_shl_constant.ll
index b783a51c2eef7..1f89efe38af37 100644
--- a/llvm/test/CodeGen/X86/add_shl_constant.ll
+++ b/llvm/test/CodeGen/X86/add_shl_constant.ll
@@ -1,47 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s

-; CHECK-LABEL: add_shl_add_constant_1_i32
-; CHECK: leal 984(%rsi,%rdi,8), %eax
-; CHECK-NEXT: retq
 define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: add_shl_add_constant_1_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
+; CHECK-NEXT: addl $984, %eax ## imm = 0x3D8
+; CHECK-NEXT: retq
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
   ret i32 %add.1
 }

-; CHECK-LABEL: add_shl_add_constant_2_i32
-; CHECK: leal 984(%rsi,%rdi,8), %eax
-; CHECK-NEXT: retq
 define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: add_shl_add_constant_2_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
+; CHECK-NEXT: addl $984, %eax ## imm = 0x3D8
+; CHECK-NEXT: retq
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %y, %shl
   ret i32 %add.1
 }

-; CHECK: LCPI2_0:
-; CHECK: .long 984
-; CHECK: _add_shl_add_constant_1_v4i32
-; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
-; CHECK: paddd %xmm1, %[[REG]]
-; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]]
-; CHECK: retq
 define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; CHECK-LABEL: add_shl_add_constant_1_v4i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pslld $3, %xmm0
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: retq
   %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
   %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
   %add.1 = add <4 x i32> %shl, %y
   ret <4 x i32> %add.1
 }

-; CHECK: LCPI3_0:
-; CHECK: .long 984
-; CHECK: _add_shl_add_constant_2_v4i32
-; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
-; CHECK: paddd %xmm1, %[[REG]]
-; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]]
-; CHECK: retq
 define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; CHECK-LABEL: add_shl_add_constant_2_v4i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pslld $3, %xmm0
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: retq
   %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
   %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
   %add.1 = add <4 x i32> %y, %shl
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index 97894db1188e2..8962e5d71675c 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -48,9 +48,9 @@ entry:
 define i256 @add256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: add256:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: addq %r9, %rsi
 ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8
 ; CHECK-NEXT: movq %rcx, 16(%rdi)
@@ -202,9 +202,9 @@ define i8 @e(ptr nocapture %a, i32 %b) nounwind {
define %scalar @pr31719(ptr nocapture readonly %this, %scalar %arg.b) nounwind { ; CHECK-LABEL: pr31719: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx ; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: adcq 24(%rsi), %r9 ; CHECK-NEXT: movq %rdx, (%rdi) @@ -315,23 +315,23 @@ entry: define %S @readd(ptr nocapture readonly %this, %S %arg.b) nounwind { ; CHECK-LABEL: readd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %r10d -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: adcq 16(%rsi), %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %r10 +; CHECK-NEXT: movq 8(%rsi), %r10 +; CHECK-NEXT: adcq $0, %r10 +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %r11d +; CHECK-NEXT: addq %rcx, %r10 +; CHECK-NEXT: adcq 16(%rsi), %r11 +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: addq %r8, %r11 ; CHECK-NEXT: adcq 24(%rsi), %rcx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %r9, %rcx -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r10, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rdx, (%rdi) +; CHECK-NEXT: movq %r10, 8(%rdi) +; CHECK-NEXT: movq %r11, 16(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 @@ -422,10 +422,10 @@ define i128 @addcarry_to_subcarry(i64 %a, i64 %b) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: cmpq %rsi, %rdi -; CHECK-NEXT: notq %rsi ; CHECK-NEXT: setae %cl ; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq $0, %rax +; CHECK-NEXT: notq %rsi ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movzbl %cl, %edx ; CHECK-NEXT: addq %rsi, %rax @@ -1228,9 +1228,9 @@ define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, pt define void @add_U192_without_i128_or(ptr sret(%uint192) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6) nounwind { ; CHECK-LABEL: add_U192_without_i128_or: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %r8, %rsi ; CHECK-NEXT: adcq %r9, %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: movq %rdx, 8(%rdi) diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll index c810fe137024c..fcfdd2ae2cb2f 100644 --- a/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll +++ b/llvm/test/CodeGen/X86/addr-mode-matcher-2.ll @@ -53,7 +53,8 @@ define void @foo_sext_nsw(i1 zeroext, i32) nounwind { ; X64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: cltq ; X64-NEXT: shlq $2, %rax -; X64-NEXT: leaq 20(%rax,%rax,4), %rdi +; X64-NEXT: leaq (%rax,%rax,4), %rdi +; X64-NEXT: addq $20, %rdi ; X64-NEXT: callq bar@PLT ; X64-NEXT: jmp .LBB0_2 br i1 %0, label %9, label %3 @@ -196,7 +197,8 @@ define void @foo_zext_nuw(i1 zeroext, i32) nounwind { ; X64-NEXT: .LBB3_2: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl %eax, %eax ; X64-NEXT: shlq $2, %rax -; X64-NEXT: leaq 20(%rax,%rax,4), %rdi +; X64-NEXT: leaq (%rax,%rax,4), %rdi +; X64-NEXT: addq $20, %rdi ; X64-NEXT: callq bar@PLT ; X64-NEXT: jmp .LBB3_2 br i1 %0, label %9, label %3 diff --git a/llvm/test/CodeGen/X86/and-sink.ll b/llvm/test/CodeGen/X86/and-sink.ll index d9a34d743ac65..3c43355a168ee 100644 --- 
a/llvm/test/CodeGen/X86/and-sink.ll +++ b/llvm/test/CodeGen/X86/and-sink.ll @@ -15,8 +15,8 @@ define i32 @and_sink1(i32 %a, i1 %c) { ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: # %bb.1: # %bb0 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: movl $0, A +; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: jne .LBB0_3 ; CHECK-NEXT: # %bb.2: # %bb1 ; CHECK-NEXT: movl $1, %eax @@ -62,8 +62,8 @@ define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) { ; CHECK-NEXT: je .LBB1_5 ; CHECK-NEXT: # %bb.3: # %bb1 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: movl $0, C +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: jne .LBB1_2 ; CHECK-NEXT: # %bb.4: # %bb2 ; CHECK-NEXT: movl $1, %eax @@ -107,8 +107,8 @@ define i32 @and_sink3(i1 %c, ptr %p) { ; CHECK-NEXT: # %bb.1: # %bb0 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl (%eax), %eax -; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: movl $0, A +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je .LBB2_2 ; CHECK-NEXT: .LBB2_3: # %bb2 ; CHECK-NEXT: xorl %eax, %eax @@ -145,13 +145,13 @@ define i32 @and_sink4(i32 %a, i32 %b, i1 %c) { ; CHECK-NEXT: # %bb.1: # %bb0 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: testl %eax, %ecx ; CHECK-NEXT: movl $0, A +; CHECK-NEXT: testl %eax, %ecx ; CHECK-NEXT: jne .LBB3_4 ; CHECK-NEXT: # %bb.2: # %bb1 ; CHECK-NEXT: leal (%ecx,%eax), %edx -; CHECK-NEXT: testl %eax, %ecx ; CHECK-NEXT: movl %edx, B +; CHECK-NEXT: testl %eax, %ecx ; CHECK-NEXT: je .LBB3_3 ; CHECK-NEXT: .LBB3_4: # %bb3 ; CHECK-NEXT: xorl %eax, %eax @@ -195,14 +195,14 @@ define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) { ; CHECK-NEXT: je .LBB4_4 ; CHECK-NEXT: # %bb.1: # %bb0 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, A +; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jne .LBB4_4 ; CHECK-NEXT: # %bb.2: # %bb1 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: movl %ecx, B +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: je .LBB4_3 ; CHECK-NEXT: .LBB4_4: # %bb3 ; CHECK-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll index fc573fbd4fc99..04b5cd6e9a132 100644 --- a/llvm/test/CodeGen/X86/andnot-patterns.ll +++ b/llvm/test/CodeGen/X86/andnot-patterns.ll @@ -31,10 +31,10 @@ define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind { ; X86-NOBMI-NEXT: .LBB0_3: ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax -; X86-NOBMI-NEXT: notl %eax -; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shldl %cl, %esi, %edx +; X86-NOBMI-NEXT: notl %eax +; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: notl %edx ; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: popl %esi @@ -272,10 +272,10 @@ define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind { ; X86-NOBMI-NEXT: .LBB5_3: ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: shrdl %cl, %edx, %eax -; X86-NOBMI-NEXT: notl %eax -; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOBMI-NEXT: shrdl %cl, %esi, %edx +; X86-NOBMI-NEXT: notl %eax +; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: notl %edx ; X86-NOBMI-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: popl %esi @@ -454,8 +454,8 @@ 
define i32 @andnot_rotr_i32_multiuse_not(i32 %a0, i32 %a1, i32 %a2) nounwind { ; X86-BMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: notl %edx ; X86-BMI-NEXT: rorl %cl, %eax +; X86-BMI-NEXT: notl %edx ; X86-BMI-NEXT: andnl {{[0-9]+}}(%esp), %eax, %esi ; X86-BMI-NEXT: pushl %edx ; X86-BMI-NEXT: calll use_i32@PLT diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 1fada58f05ba9..d09081ed13864 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -659,18 +659,18 @@ define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.v define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: @@ -749,7 +749,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 @@ -761,7 +761,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax @@ -803,24 +803,24 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa 16(%rdi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq @@ -868,7 +868,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -881,7 +881,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -919,24 +919,24 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa 16(%rdi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq @@ -968,7 +968,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -978,7 +978,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -989,7 +989,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -1000,7 +1000,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -1070,20 +1070,20 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq @@ -1157,21 +1157,21 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -1281,21 +1281,21 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; SSE2-LABEL: 
vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -1404,22 +1404,22 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,7 +1448,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -1464,7 +1464,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = 
[18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1555,18 +1555,18 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa 32(%rdi), %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa %xmm4, (%rcx) @@ -1636,7 +1636,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1659,56 +1659,56 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa 32(%rdi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; 
AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1754,7 +1754,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1776,22 +1776,22 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1828,12 +1828,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -1841,12 +1841,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -1854,12 +1854,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -1869,7 +1869,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1911,29 +1911,29 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa 32(%rdi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] @@ -1978,7 +1978,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2004,7 +2004,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2026,7 +2026,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2100,12 +2100,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -2113,12 +2113,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2127,10 +2127,10 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2139,12 +2139,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -2153,10 +2153,10 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2178,7 +2178,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2278,12 +2278,12 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2304,12 +2304,12 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2330,7 +2330,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2354,8 +2354,8 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm1 @@ -2490,8 +2490,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -2697,31 +2697,31 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), 
%xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2778,17 +2778,18 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa 16(%rdx), %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: palignr {{.*#+}} xmm2 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: paddb %xmm0, %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2959,31 +2960,31 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 
48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3016,8 +3017,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -3172,22 +3173,22 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; ; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm1 -; SSE42-NEXT: movdqa 16(%rdi), %xmm2 -; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 16(%rsi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm2 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: movdqa (%rdi), %xmm3 +; SSE42-NEXT: paddb (%rsi), %xmm3 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: paddb %xmm3, %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq @@ -3200,7 +3201,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 @@ -3219,7 +3220,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, 
%ymm0 @@ -3234,11 +3235,11 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2)) -; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm1 ^ ymm3)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3251,11 +3252,11 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -3326,7 +3327,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3341,7 +3342,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -3406,17 +3407,17 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i define void 
@vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: movdqa 16(%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 @@ -3512,7 +3513,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3555,44 +3556,44 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; ; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2 -; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] @@ -3606,33 +3607,33 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3640,7 +3641,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3762,8 +3763,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3786,8 +3786,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -3843,8 +3843,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] @@ -3858,33 +3858,33 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), 
%xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3892,7 +3892,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3951,9 +3951,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 @@ -4015,8 +4015,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -4128,7 +4127,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4207,8 +4206,8 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 @@ -4223,10 +4222,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,0,23] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4239,10 +4238,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,0,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4256,7 +4255,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4338,7 +4337,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 @@ -4350,10 +4349,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,22,0] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4366,10 +4365,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,22,0] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4383,7 +4382,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,0,1,1,0,0,0,0] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,0,1,1,0,0,0,0] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4394,7 +4393,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4450,9 +4449,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 @@ -4483,10 +4482,10 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,22,23] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4499,10 +4498,10 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,21,22,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4528,7 +4527,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: 
vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4599,7 +4598,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -4608,12 +4607,12 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,13,14,15] +; AVX512F-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4621,12 +4620,12 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,13,14,15] +; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4636,7 +4635,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4646,7 +4645,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] ; AVX512BW-FAST-NEXT: vpaddb 
(%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] @@ -4702,21 +4701,21 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -4737,10 +4736,10 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4753,10 +4752,10 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4782,7 +4781,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4835,8 +4834,8 @@ define 
void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] @@ -4850,11 +4849,11 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -4863,12 +4862,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4876,12 +4875,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4891,7 +4890,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512BW-SLOW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ 
-4901,7 +4900,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 7fcca526e460c..55648e07a8292 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -643,7 +643,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -652,7 +652,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] @@ -737,7 +737,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -747,7 +747,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -968,8 +968,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -983,8 +983,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 @@ -997,8 +997,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -1064,8 +1064,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -1079,8 +1079,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 @@ -1093,8 +1093,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ 
-1159,8 +1159,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; SSE2-NEXT: pandn (%rdi), %xmm1 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm2 @@ -1184,7 +1184,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -1196,7 +1196,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1328,7 +1328,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1354,8 +1354,8 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -1415,7 +1415,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1440,8 +1440,8 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; SSE2-NEXT: pandn (%rdi), %xmm1 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm2 @@ -1501,7 +1501,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1524,9 +1524,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rdx) @@ -1662,7 +1662,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1680,7 +1680,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1698,7 +1698,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1772,7 +1772,7 @@ define void 
@vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1790,7 +1790,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1808,7 +1808,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1848,21 +1848,21 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: paddb %xmm0, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: @@ -2147,12 +2147,12 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vpbroadcastb 
(%rdi), %ymm1 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -2188,16 +2188,17 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; SSE42-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: palignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: paddb %xmm0, %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) ; SSE42-NEXT: retq ; @@ -2368,12 +2369,12 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; ; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -2541,7 +2542,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0 @@ -2560,7 +2561,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551615,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb 
%ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2573,11 +2574,11 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2)) -; AVX512F-NEXT: vpaddb (%rsi), %ymm3, %ymm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,1] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm0 ^ ymm3)) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -2588,11 +2589,11 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm0 ^ ymm2)) -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm3, %ymm0 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm0 ^ ymm3)) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -2652,7 +2653,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2664,7 +2665,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 @@ -2676,7 +2677,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; 
AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq {{.*#+}} xmm1 = mem ^ (xmm1 & (xmm0 ^ mem)) ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2688,7 +2689,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = mem ^ (xmm1 & (xmm0 ^ mem)) ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2722,20 +2723,20 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; SSE2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm2, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm2, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: @@ -2757,60 +2758,60 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,0,0,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; 
AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = mem[0,0,0,0] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; 
AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2912,7 +2913,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,0,28,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,0,28,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2975,43 +2976,43 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, 
%ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3116,7 +3117,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3168,17 +3169,17 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm2, (%rdx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -3222,7 +3223,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3313,7 +3314,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3380,19 +3381,19 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3404,7 +3405,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3416,7 +3417,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,0,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3438,15 +3439,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],mem[1,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE2-NEXT: paddb (%rsi), %xmm2 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rdx) -; SSE2-NEXT: movdqa %xmm2, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm2, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -3484,7 +3486,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3495,7 +3497,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3507,7 +3509,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3519,7 +3521,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3596,7 +3598,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3608,7 +3610,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3620,7 +3622,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3677,7 +3679,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -3686,7 +3688,7 @@ define void 
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -3695,7 +3697,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3704,7 +3706,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512BW-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3780,7 +3782,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3792,7 +3794,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3804,7 +3806,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3871,7 +3873,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -3880,7 +3882,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-NEXT: 
vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3889,7 +3891,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll b/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll index 3a2c954e37077..aea80286c679a 100644 --- a/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll +++ b/llvm/test/CodeGen/X86/apx/check-nf-in-suppress-reloc-pass.ll @@ -39,10 +39,10 @@ define fastcc void @foo(i32 %0, i1 %or.cond) nounwind { ; CHECK-NEXT: # %bb.2: # %if.then37 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: movq 0, %rcx +; CHECK-NEXT: {nf} addq %r15, %rcx +; CHECK-NEXT: movb $0, (%rbp,%rcx) ; CHECK-NEXT: addq %rbp, %rax -; CHECK-NEXT: movq 0, %rax -; CHECK-NEXT: {nf} addq %r15, %rax -; CHECK-NEXT: movb $0, (%rbp,%rax) ; CHECK-NEXT: jmp .LBB0_3 entry: %1 = sext i32 %0 to i64 diff --git a/llvm/test/CodeGen/X86/apx/cmov.ll b/llvm/test/CodeGen/X86/apx/cmov.ll index 7b846120d3f72..4f93111523d58 100644 --- a/llvm/test/CodeGen/X86/apx/cmov.ll +++ b/llvm/test/CodeGen/X86/apx/cmov.ll @@ -23,9 +23,9 @@ define i16 @cmov16(i16 %a, i16 %b, i16 %x, ptr %y.ptr) { ; CHECK-LABEL: cmov16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpw %si, %di # encoding: [0x66,0x39,0xf7] -; CHECK-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa] ; CHECK-NEXT: cmovaw (%rcx), %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x47,0x11] -; CHECK-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8] +; CHECK-NEXT: cmoval %edi, %edx # EVEX TO LEGACY Compression encoding: [0x0f,0x47,0xd7] +; CHECK-NEXT: addw %dx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xd0] ; CHECK-NEXT: retq # encoding: [0xc3] entry: %cond = icmp ugt i16 %a, %b @@ -40,8 +40,8 @@ define i32 @cmov32(i32 %a, i32 %b, i32 %x, ptr %y.ptr) { ; CHECK-LABEL: cmov32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] -; CHECK-NEXT: cmoval %edi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0xd7] -; CHECK-NEXT: cmoval (%rcx), %edx # EVEX TO LEGACY Compression encoding: [0x0f,0x47,0x11] +; CHECK-NEXT: cmoval (%rcx), %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0x11] +; CHECK-NEXT: cmoval %edi, %edx # EVEX TO LEGACY Compression encoding: [0x0f,0x47,0xd7] ; CHECK-NEXT: addl %edx, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xd0] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -57,8 +57,8 @@ define i64 @cmov64(i64 %a, i64 %b, i64 %x, ptr %y.ptr) { ; CHECK-LABEL: cmov64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpq %rsi, %rdi # encoding: [0x48,0x39,0xf7] -; CHECK-NEXT: cmovaq %rdi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x47,0xd7] -; CHECK-NEXT: cmovaq (%rcx), %rdx # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x47,0x11] +; CHECK-NEXT: cmovaq (%rcx), %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x47,0x11] +; CHECK-NEXT: cmovaq %rdi, %rdx # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x47,0xd7] ; CHECK-NEXT: addq %rdx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xd0] ; CHECK-NEXT: retq # encoding: [0xc3] entry: diff --git 
a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll index 0dcda8efdbc78..207866176cb6d 100644 --- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll +++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll @@ -26,8 +26,8 @@ declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: flag_copy_2: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; CHECK-NEXT: movq %r8, %rdi ; CHECK-NEXT: {nf} sarq $63, %rdi diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll index a29a92176f432..b90706ac52d45 100644 --- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll @@ -11,951 +11,951 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NEXT: pushq %r13 ; EGPR-NEXT: pushq %r12 ; EGPR-NEXT: pushq %rbx -; EGPR-NEXT: subq $104, %rsp +; EGPR-NEXT: subq $96, %rsp ; EGPR-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %rdi, %r26 -; EGPR-NEXT: movq (%rdi), %r13 -; EGPR-NEXT: movq 8(%rdi), %r18 -; EGPR-NEXT: movq 24(%rdi), %r21 -; EGPR-NEXT: movq 16(%rdi), %r17 +; EGPR-NEXT: movq %rdi, %r29 ; EGPR-NEXT: movq 40(%rdi), %rdi -; EGPR-NEXT: movq 32(%r26), %r10 -; EGPR-NEXT: movq 56(%r26), %r15 -; EGPR-NEXT: movq 48(%r26), %r12 -; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq 24(%rsi), %r25 -; EGPR-NEXT: movq 16(%rsi), %r11 -; EGPR-NEXT: movq (%rsi), %r31 -; EGPR-NEXT: movq 8(%rsi), %r14 -; EGPR-NEXT: movq %r12, %rax -; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq 56(%r29), %r24 +; EGPR-NEXT: movq 48(%r29), %r25 +; EGPR-NEXT: movq (%rsi), %r30 +; EGPR-NEXT: movq 8(%rsi), %r22 +; EGPR-NEXT: movq %rsi, %r13 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r30 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r19 -; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r10 ; EGPR-NEXT: movq %rax, %r16 ; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %r12, %rax -; EGPR-NEXT: mulq %r14 -; EGPR-NEXT: movq %rdx, %r22 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r22 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r17 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r16, %r9 +; EGPR-NEXT: adcq %r10, %r17 +; EGPR-NEXT: movq 32(%r29), %rsi ; EGPR-NEXT: setb %al ; EGPR-NEXT: movzbl %al, %ecx -; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: mulq %r14 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r22, %r16 -; EGPR-NEXT: adcq %rcx, %r9 -; EGPR-NEXT: movq %r10, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r22 -; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r17, %r10 +; EGPR-NEXT: adcq %rcx, %r8 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r26 ; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: addq %r22, %r24 -; EGPR-NEXT: adcq $0, %r23 -; EGPR-NEXT: movq %r10, %rax -; EGPR-NEXT: mulq %r14 
-; EGPR-NEXT: movq %rdx, %r22 -; EGPR-NEXT: movq %rax, %r20 -; EGPR-NEXT: addq %r24, %r20 -; EGPR-NEXT: adcq %r23, %r22 +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r17 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r16, %r18 +; EGPR-NEXT: adcq $0, %r17 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r19 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r18, %r31 +; EGPR-NEXT: adcq %r17, %r19 +; EGPR-NEXT: movq 24(%r13), %r20 ; EGPR-NEXT: setb %al ; EGPR-NEXT: movzbl %al, %ecx ; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r14 -; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: addq %r22, %r24 -; EGPR-NEXT: adcq %rcx, %r23 -; EGPR-NEXT: addq %r19, %r24 -; EGPR-NEXT: adcq %r8, %r23 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r10, %rax -; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r19, %r18 +; EGPR-NEXT: adcq %rcx, %r16 +; EGPR-NEXT: addq %r11, %r18 +; EGPR-NEXT: adcq %r9, %r16 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq 16(%r13), %rbp +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %rbp +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r21 ; EGPR-NEXT: movq %rdi, %rax ; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: mulq %rbp +; EGPR-NEXT: movq %rdx, %r11 +; EGPR-NEXT: movq %rax, %r17 +; EGPR-NEXT: addq %r9, %r17 +; EGPR-NEXT: adcq $0, %r11 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r19 -; EGPR-NEXT: movq %rax, %r22 -; EGPR-NEXT: addq %r8, %r22 -; EGPR-NEXT: adcq $0, %r19 -; EGPR-NEXT: movq %r10, %rax -; EGPR-NEXT: mulq %r25 -; EGPR-NEXT: movq %rdx, %rbx -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r22, %r29 -; EGPR-NEXT: adcq %r19, %rbx +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r17, %r28 +; EGPR-NEXT: adcq %r11, %r19 +; EGPR-NEXT: movq (%r29), %r11 ; EGPR-NEXT: setb %al ; EGPR-NEXT: movzbl %al, %ecx ; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r25 -; EGPR-NEXT: movq %rdx, %r30 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %rbx, %r8 -; EGPR-NEXT: adcq %rcx, %r30 -; EGPR-NEXT: addq %r24, %r28 -; EGPR-NEXT: adcq %r23, %r29 -; EGPR-NEXT: adcq $0, %r8 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r30 -; EGPR-NEXT: setb %al -; EGPR-NEXT: movzbl %al, %ecx -; EGPR-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r12, %rax -; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r16 -; EGPR-NEXT: movq %rax, %r23 -; EGPR-NEXT: addq %r9, %r23 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: movq %r12, %rax -; EGPR-NEXT: mulq %r25 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %rdi -; EGPR-NEXT: addq %r23, %rdi -; EGPR-NEXT: adcq %r16, %r9 -; EGPR-NEXT: setb %al -; EGPR-NEXT: movzbl %al, %r10d -; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: addq %r9, %r24 -; EGPR-NEXT: adcq %r10, %r23 -; EGPR-NEXT: addq %r8, %rsi -; EGPR-NEXT: movq %rsi, %r19 -; EGPR-NEXT: adcq %r30, 
%rdi -; EGPR-NEXT: adcq %rcx, %r24 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r19, %r9 +; EGPR-NEXT: adcq %rcx, %r23 +; EGPR-NEXT: addq %r18, %r21 +; EGPR-NEXT: adcq %r16, %r28 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq 24(%r29), %rdi ; EGPR-NEXT: adcq $0, %r23 -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: addq %r10, %r9 +; EGPR-NEXT: adcq %r8, %r23 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %rbp ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %rbx -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r9 ; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %r24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %rbp +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r8, %r18 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r16, %r30 -; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: movq %rax, %r25 +; EGPR-NEXT: addq %r18, %r25 +; EGPR-NEXT: adcq %r10, %r8 +; EGPR-NEXT: movzbl %cl, %ecx +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %esi +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r20 +; EGPR-NEXT: movq %rdx, %r18 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: addq %r8, %r19 +; EGPR-NEXT: adcq %rsi, %r18 +; EGPR-NEXT: addq %r9, %r16 +; EGPR-NEXT: adcq %r23, %r25 +; EGPR-NEXT: adcq %rcx, %r19 +; EGPR-NEXT: movq 16(%r29), %r24 +; EGPR-NEXT: adcq $0, %r18 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r23 +; EGPR-NEXT: addq %r8, %r23 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r23, %r9 +; EGPR-NEXT: adcq %r10, %r27 +; EGPR-NEXT: movq 8(%r29), %r23 ; EGPR-NEXT: setb %al ; EGPR-NEXT: movzbl %al, %ecx -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r14 -; EGPR-NEXT: movq %r14, %rsi -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq %rcx, %r9 -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r27, %r10 +; EGPR-NEXT: adcq %rcx, %r8 +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r30 ; EGPR-NEXT: movq %rdx, %r14 ; EGPR-NEXT: movq %rax, %r15 -; EGPR-NEXT: addq %r8, %r15 +; EGPR-NEXT: addq %r27, %r15 ; EGPR-NEXT: adcq $0, %r14 -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %r12 ; EGPR-NEXT: addq %r15, %rax ; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NEXT: adcq %r14, %r12 ; EGPR-NEXT: setb %cl -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %rsi -; EGPR-NEXT: 
movq %rdx, %r8 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, %r15 ; EGPR-NEXT: addq %r12, %r15 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r8 +; EGPR-NEXT: adcq %rax, %r27 ; EGPR-NEXT: addq %rbx, %r15 -; EGPR-NEXT: adcq %r30, %r8 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: adcq %r9, %r27 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %rbp +; EGPR-NEXT: movq %rdx, %r9 ; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %rbp ; EGPR-NEXT: movq %rdx, %rbx ; EGPR-NEXT: movq %rax, %r14 -; EGPR-NEXT: addq %r30, %r14 +; EGPR-NEXT: addq %r9, %r14 ; EGPR-NEXT: adcq $0, %rbx -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r12 ; EGPR-NEXT: addq %r14, %rax -; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: movq %rax, %r22 ; EGPR-NEXT: adcq %rbx, %r12 ; EGPR-NEXT: setb %cl -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r14 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r12, %r30 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r12, %r9 ; EGPR-NEXT: movzbl %cl, %eax ; EGPR-NEXT: adcq %rax, %r14 ; EGPR-NEXT: addq %r15, %rsi ; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r8, %r10 -; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: adcq %r27, %r22 +; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r9 ; EGPR-NEXT: adcq $0, %r14 -; EGPR-NEXT: addq %r16, %r30 -; EGPR-NEXT: adcq %r9, %r14 +; EGPR-NEXT: addq %r10, %r9 +; EGPR-NEXT: adcq %r8, %r14 ; EGPR-NEXT: setb %cl -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %rbp ; EGPR-NEXT: movq %rdx, %r8 ; EGPR-NEXT: movq %rax, %rbx -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %rbp +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r8, %r27 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r8 ; EGPR-NEXT: movq %rax, %r15 -; EGPR-NEXT: addq %r16, %r15 -; EGPR-NEXT: adcq %r9, %r8 -; EGPR-NEXT: setb %r9b -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: addq %r27, %r15 +; EGPR-NEXT: adcq %r10, %r8 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r12 ; EGPR-NEXT: movq %rax, %rbp ; EGPR-NEXT: addq %r8, %rbp -; EGPR-NEXT: movzbl %r9b, %eax +; EGPR-NEXT: movzbl %sil, %eax ; EGPR-NEXT: adcq %rax, %r12 -; EGPR-NEXT: addq %r30, %rbx +; EGPR-NEXT: addq %r9, %rbx ; EGPR-NEXT: adcq %r14, %r15 ; EGPR-NEXT: movzbl %cl, %eax ; EGPR-NEXT: adcq %rax, %rbp ; EGPR-NEXT: adcq $0, %r12 -; EGPR-NEXT: addq %r27, %rbx -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; EGPR-NEXT: movq 32(%rsi), %r27 -; EGPR-NEXT: 
adcq %r20, %r15 -; EGPR-NEXT: adcq %r28, %rbp -; EGPR-NEXT: adcq %r29, %r12 +; EGPR-NEXT: addq %r26, %rbx +; EGPR-NEXT: adcq %r31, %r15 +; EGPR-NEXT: adcq %r21, %rbp +; EGPR-NEXT: adcq %r28, %r12 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r25 ; EGPR-NEXT: adcq $0, %r19 -; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq $0, %rdi -; EGPR-NEXT: adcq $0, %r24 -; EGPR-NEXT: adcq $0, %r23 -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: adcq $0, %r18 +; EGPR-NEXT: movq 32(%r13), %r26 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r26 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r20 -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq 40(%rsi), %rcx -; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r8, %r27 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq 40(%r13), %rcx +; EGPR-NEXT: movq %r24, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r16, %r30 -; EGPR-NEXT: adcq %r9, %r8 -; EGPR-NEXT: setb %r10b -; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r27, %r9 +; EGPR-NEXT: adcq %r10, %r21 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %rdi, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: movzbl %r10b, %eax -; EGPR-NEXT: adcq %rax, %r9 -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r27 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r19 -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r28 -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r8, %r29 -; EGPR-NEXT: adcq $0, %r28 -; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r21, %r10 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r8 +; EGPR-NEXT: movq %r11, %rsi +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r27, %r28 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: movq %rsi, %r17 ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r22 -; EGPR-NEXT: addq %r29, %r22 -; EGPR-NEXT: adcq %r28, %r8 -; EGPR-NEXT: setb %r10b -; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r28, %r16 +; EGPR-NEXT: adcq %r21, %r27 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r23, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r28 -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r8, %r29 -; EGPR-NEXT: movzbl %r10b, %eax -; EGPR-NEXT: adcq %rax, %r28 -; EGPR-NEXT: addq %r20, %r29 -; EGPR-NEXT: adcq %r30, %r28 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq 48(%rsi), %r20 -; EGPR-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r20 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r11 -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: mulq 
%r20 -; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r27, %r28 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r21 +; EGPR-NEXT: addq %r31, %r28 +; EGPR-NEXT: adcq %r9, %r21 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq 48(%r13), %r31 +; EGPR-NEXT: movq %r17, %rsi +; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r17 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, %r14 -; EGPR-NEXT: addq %r8, %r14 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: movq 56(%rsi), %r10 -; EGPR-NEXT: movq %r13, %rax -; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: addq %r9, %r14 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: movq 56(%r13), %r22 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %r13 ; EGPR-NEXT: addq %r14, %rax ; EGPR-NEXT: movq %rax, %r14 -; EGPR-NEXT: adcq %r30, %r13 +; EGPR-NEXT: adcq %r27, %r13 ; EGPR-NEXT: setb %sil -; EGPR-NEXT: movq %r18, %rax -; EGPR-NEXT: mulq %r10 -; EGPR-NEXT: movq %rdx, %r30 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r13, %r8 -; EGPR-NEXT: movzbl %sil, %eax -; EGPR-NEXT: adcq %rax, %r30 -; EGPR-NEXT: addq %r29, %r11 -; EGPR-NEXT: adcq %r28, %r14 -; EGPR-NEXT: adcq $0, %r8 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r30 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r20 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r13, %r27 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: addq %r28, %r17 +; EGPR-NEXT: adcq %r21, %r14 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: addq %r10, %r27 +; EGPR-NEXT: adcq %r8, %r9 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r21 +; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r10 ; EGPR-NEXT: movq %rax, %r28 -; EGPR-NEXT: movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r20 -; EGPR-NEXT: movq %rdx, %r16 -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r9, %r29 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r10 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r17 -; EGPR-NEXT: addq %r29, %r17 -; EGPR-NEXT: adcq %r16, %r9 -; EGPR-NEXT: setb %r16b -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: addq %r8, %r28 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r24 +; EGPR-NEXT: addq %r28, %r24 +; EGPR-NEXT: adcq %r10, %r8 +; EGPR-NEXT: setb %r10b +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %r13 -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r9, %r29 -; EGPR-NEXT: movzbl %r16b, %eax -; EGPR-NEXT: adcq %rax, %r13 +; EGPR-NEXT: movq %rax, %r28 ; EGPR-NEXT: addq %r8, %r28 -; EGPR-NEXT: adcq %r30, %r17 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r29 
+; EGPR-NEXT: movzbl %r10b, %eax +; EGPR-NEXT: adcq %rax, %r13 +; EGPR-NEXT: addq %r27, %r21 +; EGPR-NEXT: adcq %r9, %r24 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r28 ; EGPR-NEXT: adcq $0, %r13 -; EGPR-NEXT: addq %rbx, %r19 -; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r15, %r22 -; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %rbp, %r11 +; EGPR-NEXT: addq %rbx, %r11 ; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r15, %r16 +; EGPR-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %rbp, %r17 +; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NEXT: adcq %r12, %r14 ; EGPR-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: adcq $0, %r24 ; EGPR-NEXT: adcq $0, %r28 -; EGPR-NEXT: adcq $0, %r17 -; EGPR-NEXT: adcq $0, %r29 ; EGPR-NEXT: adcq $0, %r13 -; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload -; EGPR-NEXT: adcq %rdi, %r17 -; EGPR-NEXT: adcq %r24, %r29 -; EGPR-NEXT: adcq %r23, %r13 -; EGPR-NEXT: setb %r15b -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; EGPR-NEXT: movq %rsi, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r19 -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %rsi, %rax -; EGPR-NEXT: movq %rsi, %r21 -; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload +; EGPR-NEXT: adcq %r25, %r24 +; EGPR-NEXT: adcq %r19, %r28 +; EGPR-NEXT: adcq %r18, %r13 +; EGPR-NEXT: setb %bl +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r26 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r22 -; EGPR-NEXT: addq %r16, %r22 -; EGPR-NEXT: adcq %r9, %r8 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: movq %r23, %r14 -; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r10 ; EGPR-NEXT: movq %rax, %r16 ; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r9 -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %rdi -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; EGPR-NEXT: movq %rsi, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: addq %r8, %r24 -; EGPR-NEXT: adcq $0, %r23 -; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: movq %rdi, %r15 ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: addq %r24, %rax -; EGPR-NEXT: movq %rax, %r11 -; EGPR-NEXT: adcq %r23, %r8 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %rsi, %rax -; EGPR-NEXT: movq %rsi, %r23 +; EGPR-NEXT: movq %rdx, %r18 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r16, %r9 +; EGPR-NEXT: adcq %r10, %r18 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: movq %r19, %r12 ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r24 -; EGPR-NEXT: movq 
%rax, %r30 -; EGPR-NEXT: addq %r8, %r30 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r24 -; EGPR-NEXT: addq %r19, %r30 -; EGPR-NEXT: adcq %r22, %r24 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: mulq %r20 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: mulq %r20 -; EGPR-NEXT: movq %rdx, %r19 -; EGPR-NEXT: movq %rax, %r22 -; EGPR-NEXT: addq %r8, %r22 -; EGPR-NEXT: adcq $0, %r19 -; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: mulq %r10 -; EGPR-NEXT: movq %rdx, %rbx -; EGPR-NEXT: addq %r22, %rax -; EGPR-NEXT: movq %rax, %r22 -; EGPR-NEXT: adcq %r19, %rbx -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: mulq %r10 -; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %rbx, %r8 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r23 -; EGPR-NEXT: addq %r30, %rsi -; EGPR-NEXT: adcq %r24, %r22 -; EGPR-NEXT: adcq $0, %r8 -; EGPR-NEXT: adcq $0, %r23 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r23 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r20 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r24 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r18, %r10 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r8 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %r20 +; EGPR-NEXT: mulq %r26 ; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r17 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; EGPR-NEXT: movq %rbp, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r18 ; EGPR-NEXT: movq %rax, %r19 -; EGPR-NEXT: addq %r9, %r19 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: movq %r21, %rax -; EGPR-NEXT: mulq %r10 -; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: addq %r16, %r19 +; EGPR-NEXT: adcq $0, %r18 +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: addq %r19, %rax +; EGPR-NEXT: movq %rax, %r25 +; EGPR-NEXT: adcq %r18, %r27 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %rbp, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r16 ; EGPR-NEXT: movq %rax, %r19 -; EGPR-NEXT: adcq %r16, %r9 -; EGPR-NEXT: setb %r16b +; EGPR-NEXT: addq %r27, %r19 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r16 +; EGPR-NEXT: addq %r11, %r19 +; EGPR-NEXT: adcq %r9, %r16 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: movq %rbp, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r11 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r9, %r18 +; EGPR-NEXT: adcq $0, %r11 ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: addq %r18, %rax +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: adcq %r11, %r27 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %rbp, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r27, %r18 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: addq %r19, %rdi +; EGPR-NEXT: adcq %r16, %r14 +; EGPR-NEXT: adcq $0, %r18 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: addq %r10, %r18 +; EGPR-NEXT: adcq %r8, %r9 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r31 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq 
%r31 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: addq %r8, %r11 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: addq %r11, %rax +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: adcq %r10, %r8 +; EGPR-NEXT: setb %r10b +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq %r22 ; EGPR-NEXT: movq %rdx, %rbp ; EGPR-NEXT: movq %rax, %r12 -; EGPR-NEXT: addq %r9, %r12 -; EGPR-NEXT: movzbl %r16b, %eax +; EGPR-NEXT: addq %r8, %r12 +; EGPR-NEXT: movzbl %r10b, %eax ; EGPR-NEXT: adcq %rax, %rbp -; EGPR-NEXT: addq %r8, %r24 -; EGPR-NEXT: adcq %r23, %r19 -; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: addq %r18, %r16 +; EGPR-NEXT: adcq %r9, %r11 +; EGPR-NEXT: movzbl %sil, %eax ; EGPR-NEXT: adcq %rax, %r12 ; EGPR-NEXT: adcq $0, %rbp -; EGPR-NEXT: addq %r28, %rdi +; EGPR-NEXT: addq %r21, %r17 +; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r24, %r25 +; EGPR-NEXT: movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r28, %rdi ; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r17, %r11 +; EGPR-NEXT: adcq %r13, %r14 +; EGPR-NEXT: movq %r14, (%rsp) # 8-byte Spill +; EGPR-NEXT: movzbl %bl, %eax +; EGPR-NEXT: adcq %rax, %r16 +; EGPR-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r11 ; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r29, %rsi -; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r13, %r22 -; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: movzbl %r15b, %eax -; EGPR-NEXT: adcq %rax, %r24 -; EGPR-NEXT: movq %r24, (%rsp) # 8-byte Spill -; EGPR-NEXT: adcq $0, %r19 -; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NEXT: adcq $0, %r12 ; EGPR-NEXT: adcq $0, %rbp -; EGPR-NEXT: movq 64(%r26), %r23 -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r23 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: movq %r25, %rax -; EGPR-NEXT: mulq %r23 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq 72(%r26), %r28 -; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq 64(%r29), %r18 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NEXT: movq %r16, %rax +; EGPR-NEXT: mulq %r18 ; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r16, %r30 -; EGPR-NEXT: adcq %r9, %r8 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r25, %rax -; EGPR-NEXT: mulq %r28 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r9 -; EGPR-NEXT: movq %r31, %rax -; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: movq %r20, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r8, %r27 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq 72(%r29), %r21 +; EGPR-NEXT: movq %r16, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r28 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r27, %r9 +; EGPR-NEXT: adcq %r10, %r28 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r20, %rax +; EGPR-NEXT: mulq %r21 ; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r28, %r10 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: 
adcq %rax, %r8 +; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; EGPR-NEXT: movq %r11, %rax -; EGPR-NEXT: mulq %r23 -; EGPR-NEXT: movq %rdx, %r29 +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r28 ; EGPR-NEXT: movq %rax, %rbx -; EGPR-NEXT: addq %r8, %rbx -; EGPR-NEXT: adcq $0, %r29 -; EGPR-NEXT: movq %r31, %rax -; EGPR-NEXT: mulq %r28 -; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: addq %r27, %rbx +; EGPR-NEXT: adcq $0, %r28 +; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: addq %rbx, %rax ; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r29, %r8 -; EGPR-NEXT: setb %r18b +; EGPR-NEXT: adcq %r28, %r27 +; EGPR-NEXT: setb %sil ; EGPR-NEXT: movq %r11, %rax -; EGPR-NEXT: mulq %r28 -; EGPR-NEXT: movq %rdx, %r29 +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r28 ; EGPR-NEXT: movq %rax, %rbx -; EGPR-NEXT: addq %r8, %rbx -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r29 -; EGPR-NEXT: addq %r24, %rbx -; EGPR-NEXT: adcq %r30, %r29 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq 80(%r26), %r13 -; EGPR-NEXT: movq %r31, %rax +; EGPR-NEXT: addq %r27, %rbx +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r28 +; EGPR-NEXT: addq %r19, %rbx +; EGPR-NEXT: adcq %r9, %r28 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: movq 80(%r29), %r13 +; EGPR-NEXT: movq %r30, %rax ; EGPR-NEXT: mulq %r13 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rdi ; EGPR-NEXT: movq %r11, %rax ; EGPR-NEXT: mulq %r13 -; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, %r14 -; EGPR-NEXT: addq %r8, %r14 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: movq 88(%r26), %r18 -; EGPR-NEXT: movq %r31, %rax -; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: addq %r9, %r14 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: movq 88(%r29), %rsi +; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: mulq %rsi ; EGPR-NEXT: movq %rdx, %r15 -; EGPR-NEXT: movq %rax, %r24 -; EGPR-NEXT: addq %r14, %r24 -; EGPR-NEXT: adcq %r30, %r15 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: addq %r14, %r19 +; EGPR-NEXT: adcq %r27, %r15 ; EGPR-NEXT: setb %r14b ; EGPR-NEXT: movq %r11, %rax -; EGPR-NEXT: mulq %r18 -; EGPR-NEXT: movq %rdx, %r30 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r15, %r8 +; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r15, %r27 ; EGPR-NEXT: movzbl %r14b, %eax -; EGPR-NEXT: adcq %rax, %r30 -; EGPR-NEXT: addq %rbx, %rsi -; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NEXT: adcq %r29, %r24 -; EGPR-NEXT: adcq $0, %r8 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r30 -; EGPR-NEXT: setb %r29b -; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: addq %rbx, %rdi +; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r28, %r19 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: addq %r10, %r27 +; EGPR-NEXT: adcq %r8, %r9 +; EGPR-NEXT: setb %r15b +; EGPR-NEXT: movq %r16, %rax ; EGPR-NEXT: mulq %r13 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: movq %r20, %rax ; EGPR-NEXT: mulq %r13 -; 
EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rdx, %r10 ; EGPR-NEXT: movq %rax, %r14 -; EGPR-NEXT: addq %r9, %r14 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: mulq %r18 -; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: addq %r8, %r14 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %r16, %rax +; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %rdx, %r8 ; EGPR-NEXT: movq %rax, %rbx ; EGPR-NEXT: addq %r14, %rbx -; EGPR-NEXT: adcq %r16, %r9 -; EGPR-NEXT: setb %r16b -; EGPR-NEXT: movq %r25, %rax -; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: adcq %r10, %r8 +; EGPR-NEXT: setb %r28b +; EGPR-NEXT: movq %r20, %rax +; EGPR-NEXT: mulq %rsi ; EGPR-NEXT: movq %rdx, %r14 -; EGPR-NEXT: movq %rax, %r15 -; EGPR-NEXT: addq %r9, %r15 -; EGPR-NEXT: movzbl %r16b, %eax +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r8, %r10 +; EGPR-NEXT: movzbl %r28b, %eax ; EGPR-NEXT: adcq %rax, %r14 -; EGPR-NEXT: addq %r8, %rsi -; EGPR-NEXT: adcq %r30, %rbx -; EGPR-NEXT: movzbl %r29b, %eax -; EGPR-NEXT: adcq %rax, %r15 +; EGPR-NEXT: addq %r27, %rdi +; EGPR-NEXT: adcq %r9, %rbx +; EGPR-NEXT: movzbl %r15b, %eax +; EGPR-NEXT: adcq %rax, %r10 ; EGPR-NEXT: adcq $0, %r14 -; EGPR-NEXT: imulq %r27, %r18 -; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: movq %r26, %rax ; EGPR-NEXT: mulq %r13 ; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r18, %rdx +; EGPR-NEXT: imulq %r26, %rsi +; EGPR-NEXT: addq %rsi, %rdx ; EGPR-NEXT: imulq %rcx, %r13 ; EGPR-NEXT: addq %rdx, %r13 -; EGPR-NEXT: movq %r20, %r9 -; EGPR-NEXT: imulq %r28, %r9 -; EGPR-NEXT: movq %r20, %rax -; EGPR-NEXT: mulq %r23 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r9, %rdx -; EGPR-NEXT: imulq %r23, %r10 -; EGPR-NEXT: addq %rdx, %r10 -; EGPR-NEXT: addq %r8, %r30 -; EGPR-NEXT: adcq %r13, %r10 -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: mulq %r27 -; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %r31, %rsi +; EGPR-NEXT: imulq %r21, %rsi +; EGPR-NEXT: movq %r31, %rax +; EGPR-NEXT: mulq %r18 ; EGPR-NEXT: movq %rax, %r9 -; EGPR-NEXT: movq %r28, %rax -; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: addq %rsi, %rdx +; EGPR-NEXT: imulq %r18, %r22 +; EGPR-NEXT: addq %rdx, %r22 +; EGPR-NEXT: addq %r8, %r9 +; EGPR-NEXT: adcq %r13, %r22 +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r26 ; EGPR-NEXT: movq %rdx, %r27 -; EGPR-NEXT: movq %rax, %r20 -; EGPR-NEXT: addq %r8, %r20 -; EGPR-NEXT: adcq $0, %r27 -; EGPR-NEXT: movq %r23, %rax -; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r20, %r16 -; EGPR-NEXT: adcq %r27, %r8 -; EGPR-NEXT: setb %r18b -; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r27, %r31 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: movq %r18, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r23 -; EGPR-NEXT: movq %rax, %r20 -; EGPR-NEXT: addq %r8, %r20 -; EGPR-NEXT: movzbl %r18b, %eax -; EGPR-NEXT: adcq %rax, %r23 -; EGPR-NEXT: addq %r30, %r20 -; EGPR-NEXT: adcq %r10, %r23 -; EGPR-NEXT: movq 112(%r26), %rcx -; EGPR-NEXT: movq %r31, %rax +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r31, %r18 +; EGPR-NEXT: adcq %r26, %r27 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r21, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: imulq %r11, %rcx -; EGPR-NEXT: addq %rdx, %rcx -; EGPR-NEXT: movq 120(%r26), %rax -; EGPR-NEXT: imulq %r31, %rax -; EGPR-NEXT: addq %rax, %rcx -; EGPR-NEXT: movq 96(%r26), %r27 -; EGPR-NEXT: movq 104(%r26), %r30 -; 
EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: imulq %r30, %rdi -; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r31 ; EGPR-NEXT: movq %rax, %r21 -; EGPR-NEXT: addq %rdi, %rdx -; EGPR-NEXT: imulq %r27, %r25 -; EGPR-NEXT: addq %rdx, %r25 -; EGPR-NEXT: addq %r8, %r21 -; EGPR-NEXT: adcq %rcx, %r25 -; EGPR-NEXT: movq %r27, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r27, %r21 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r31 +; EGPR-NEXT: addq %r9, %r21 +; EGPR-NEXT: adcq %r22, %r31 +; EGPR-NEXT: movq 120(%r29), %rcx +; EGPR-NEXT: imulq %r30, %rcx +; EGPR-NEXT: movq 112(%r29), %rsi ; EGPR-NEXT: movq %r30, %rax -; EGPR-NEXT: mulq %r31 -; EGPR-NEXT: movq %rdx, %r31 -; EGPR-NEXT: movq %rax, %r28 -; EGPR-NEXT: addq %r8, %r28 -; EGPR-NEXT: adcq $0, %r31 +; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %rcx, %rdx +; EGPR-NEXT: imulq %r11, %rsi +; EGPR-NEXT: addq %rdx, %rsi +; EGPR-NEXT: movq 96(%r29), %r26 +; EGPR-NEXT: movq 104(%r29), %r27 +; EGPR-NEXT: movq %r16, %rax +; EGPR-NEXT: movq %r16, %rcx +; EGPR-NEXT: imulq %r27, %rcx +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rax, %r29 +; EGPR-NEXT: addq %rcx, %rdx +; EGPR-NEXT: imulq %r26, %r20 +; EGPR-NEXT: addq %rdx, %r20 +; EGPR-NEXT: addq %r9, %r29 +; EGPR-NEXT: adcq %rsi, %r20 +; EGPR-NEXT: movq %r26, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 ; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %r9, %r15 +; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: movq %r26, %rax ; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r27 -; EGPR-NEXT: addq %r28, %r27 -; EGPR-NEXT: adcq %r31, %r8 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r15, %r26 +; EGPR-NEXT: adcq %r30, %r9 ; EGPR-NEXT: setb %cl -; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: movq %r27, %rax ; EGPR-NEXT: mulq %r11 -; EGPR-NEXT: movq %rdx, %r26 -; EGPR-NEXT: movq %rax, %r31 -; EGPR-NEXT: addq %r8, %r31 +; EGPR-NEXT: movq %rdx, %r25 +; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: addq %r9, %r30 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r26 -; EGPR-NEXT: addq %r21, %r31 -; EGPR-NEXT: adcq %r25, %r26 -; EGPR-NEXT: addq %r9, %r22 -; EGPR-NEXT: adcq %r16, %r27 -; EGPR-NEXT: adcq %r20, %r31 -; EGPR-NEXT: adcq %r23, %r26 -; EGPR-NEXT: addq %rsi, %r22 -; EGPR-NEXT: adcq %rbx, %r27 -; EGPR-NEXT: adcq %r15, %r31 -; EGPR-NEXT: adcq %r14, %r26 +; EGPR-NEXT: adcq %rax, %r25 +; EGPR-NEXT: addq %r29, %r30 +; EGPR-NEXT: adcq %r20, %r25 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq %r18, %r26 +; EGPR-NEXT: adcq %r21, %r30 +; EGPR-NEXT: adcq %r31, %r25 +; EGPR-NEXT: addq %rdi, %r16 +; EGPR-NEXT: adcq %rbx, %r26 +; EGPR-NEXT: adcq %r10, %r30 +; EGPR-NEXT: adcq %r14, %r25 ; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; EGPR-NEXT: movq 80(%r11), %rbx ; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload -; EGPR-NEXT: mulq %r19 -; EGPR-NEXT: movq %rax, %r23 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rax, %r18 ; EGPR-NEXT: movq %rdx, %r8 ; EGPR-NEXT: movq 88(%r11), %r20 ; EGPR-NEXT: movq %r20, %rax -; EGPR-NEXT: mulq %r19 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 -; EGPR-NEXT: adcq $0, %r9 -; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 
8-byte Reload ; EGPR-NEXT: mulq %r17 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r30 -; EGPR-NEXT: addq %r16, %r30 -; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r8, %r27 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r31 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r27, %r9 +; EGPR-NEXT: adcq %r10, %r31 ; EGPR-NEXT: setb %cl ; EGPR-NEXT: movq %r20, %rax -; EGPR-NEXT: mulq %r17 -; EGPR-NEXT: movq %rdx, %r9 -; EGPR-NEXT: movq %rax, %r16 -; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r31, %r10 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: adcq %rax, %r8 ; EGPR-NEXT: movq 64(%r11), %r15 ; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: mulq %r19 -; EGPR-NEXT: movq %rax, %r25 -; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq 72(%r11), %r14 ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %r19 -; EGPR-NEXT: movq %rdx, %r28 -; EGPR-NEXT: movq %rax, %r29 -; EGPR-NEXT: addq %r8, %r29 -; EGPR-NEXT: adcq $0, %r28 -; EGPR-NEXT: movq %r15, %rax ; EGPR-NEXT: mulq %r17 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r21 -; EGPR-NEXT: addq %r29, %r21 -; EGPR-NEXT: adcq %r28, %r8 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r27, %r28 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r29 +; EGPR-NEXT: addq %r28, %r29 +; EGPR-NEXT: adcq %r21, %r27 ; EGPR-NEXT: setb %cl ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %r17 -; EGPR-NEXT: movq %rdx, %r29 +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r28 ; EGPR-NEXT: movq %rax, %r13 -; EGPR-NEXT: addq %r8, %r13 +; EGPR-NEXT: addq %r27, %r13 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r29 -; EGPR-NEXT: addq %r23, %r13 -; EGPR-NEXT: adcq %r30, %r29 -; EGPR-NEXT: adcq $0, %r16 -; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: adcq %rax, %r28 +; EGPR-NEXT: addq %r18, %r13 +; EGPR-NEXT: adcq %r9, %r28 +; EGPR-NEXT: adcq $0, %r10 +; EGPR-NEXT: adcq $0, %r8 ; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; EGPR-NEXT: mulq %rdi -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r24 # 8-byte Reload +; EGPR-NEXT: mulq %r24 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r21 ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %rdi -; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: mulq %r24 +; EGPR-NEXT: movq %rdx, %r27 ; EGPR-NEXT: movq %rax, %rcx -; EGPR-NEXT: addq %r8, %rcx -; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: addq %r9, %rcx +; EGPR-NEXT: adcq $0, %r27 ; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload -; EGPR-NEXT: mulq %r18 -; EGPR-NEXT: movq %rdx, %r10 -; EGPR-NEXT: movq %rax, %r23 -; EGPR-NEXT: addq %rcx, %r23 -; EGPR-NEXT: adcq %r30, %r10 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r22 +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %rcx, %r18 +; EGPR-NEXT: adcq %r27, %r22 ; EGPR-NEXT: setb %cl ; EGPR-NEXT: movq %r14, %rax -; EGPR-NEXT: mulq %r18 -; EGPR-NEXT: movq %rdx, %r30 -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r10, %r8 +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: 
movq %rax, %r27 +; EGPR-NEXT: addq %r22, %r27 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r30 -; EGPR-NEXT: addq %r13, %r28 -; EGPR-NEXT: adcq %r29, %r23 -; EGPR-NEXT: adcq $0, %r8 -; EGPR-NEXT: adcq $0, %r30 -; EGPR-NEXT: addq %r16, %r8 -; EGPR-NEXT: adcq %r9, %r30 +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: addq %r13, %r21 +; EGPR-NEXT: adcq %r28, %r18 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: addq %r10, %r27 +; EGPR-NEXT: adcq %r8, %r9 ; EGPR-NEXT: setb %sil ; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: mulq %r24 ; EGPR-NEXT: movq %rdx, %rcx -; EGPR-NEXT: movq %rax, %r29 +; EGPR-NEXT: movq %rax, %r28 ; EGPR-NEXT: movq %r20, %rax -; EGPR-NEXT: mulq %rdi -; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: mulq %r24 +; EGPR-NEXT: movq %rdx, %r8 ; EGPR-NEXT: movq %rax, %r10 ; EGPR-NEXT: addq %rcx, %r10 -; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: adcq $0, %r8 ; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdi, %r22 +; EGPR-NEXT: mulq %rdi ; EGPR-NEXT: movq %rdx, %rcx ; EGPR-NEXT: movq %rax, %r13 ; EGPR-NEXT: addq %r10, %r13 -; EGPR-NEXT: adcq %r9, %rcx -; EGPR-NEXT: setb %r10b +; EGPR-NEXT: adcq %r8, %rcx +; EGPR-NEXT: setb %dil ; EGPR-NEXT: movq %r20, %rax -; EGPR-NEXT: mulq %r18 -; EGPR-NEXT: movq %rdx, %r16 -; EGPR-NEXT: movq %rax, %r9 -; EGPR-NEXT: addq %rcx, %r9 -; EGPR-NEXT: movzbl %r10b, %eax -; EGPR-NEXT: adcq %rax, %r16 -; EGPR-NEXT: addq %r8, %r29 -; EGPR-NEXT: adcq %r30, %r13 +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %rcx, %r8 +; EGPR-NEXT: movzbl %dil, %eax +; EGPR-NEXT: adcq %rax, %r10 +; EGPR-NEXT: addq %r27, %r28 +; EGPR-NEXT: adcq %r9, %r13 ; EGPR-NEXT: movzbl %sil, %eax -; EGPR-NEXT: adcq %rax, %r9 -; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq %rax, %r8 +; EGPR-NEXT: adcq $0, %r10 ; EGPR-NEXT: movq 96(%r11), %rcx -; EGPR-NEXT: imulq %rcx, %r18 +; EGPR-NEXT: imulq %rcx, %r22 ; EGPR-NEXT: movq %rcx, %rax -; EGPR-NEXT: mulq %rdi -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: addq %r18, %rdx -; EGPR-NEXT: movq 104(%r11), %r30 -; EGPR-NEXT: movq %rdi, %rax -; EGPR-NEXT: imulq %r30, %rax +; EGPR-NEXT: mulq %r24 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r22, %rdx +; EGPR-NEXT: movq 104(%r11), %rdi +; EGPR-NEXT: movq %r24, %rax +; EGPR-NEXT: imulq %rdi, %rax ; EGPR-NEXT: addq %rdx, %rax -; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: movq %rax, %r24 ; EGPR-NEXT: movq 112(%r11), %rax ; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: imulq %r17, %rsi -; EGPR-NEXT: mulq %r19 -; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: imulq %r23, %rsi +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rax, %r22 ; EGPR-NEXT: addq %rsi, %rdx -; EGPR-NEXT: movq 120(%r11), %r18 -; EGPR-NEXT: imulq %r19, %r18 -; EGPR-NEXT: addq %rdx, %r18 -; EGPR-NEXT: addq %r8, %rdi -; EGPR-NEXT: adcq %r10, %r18 -; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: movq 120(%r11), %r27 +; EGPR-NEXT: imulq %r17, %r27 +; EGPR-NEXT: addq %rdx, %r27 +; EGPR-NEXT: addq %r9, %r22 +; EGPR-NEXT: adcq %r24, %r27 +; EGPR-NEXT: movq %r17, %rax ; EGPR-NEXT: mulq %rcx -; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rdx, %r9 ; EGPR-NEXT: movq %rax, %rsi -; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: movq %r23, %rax ; EGPR-NEXT: mulq %rcx ; EGPR-NEXT: movq %rdx, %rcx -; EGPR-NEXT: movq %rax, %r10 -; EGPR-NEXT: addq %r8, %r10 +; EGPR-NEXT: movq %rax, %r24 +; EGPR-NEXT: addq %r9, %r24 ; EGPR-NEXT: adcq $0, %rcx -; EGPR-NEXT: movq %r19, %rax -; EGPR-NEXT: mulq %r30 -; EGPR-NEXT: movq %rdx, %r8 -; EGPR-NEXT: movq %rax, %r11 -; 
EGPR-NEXT: addq %r10, %r11 -; EGPR-NEXT: adcq %rcx, %r8 -; EGPR-NEXT: setb %cl ; EGPR-NEXT: movq %r17, %rax -; EGPR-NEXT: mulq %r30 -; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r9 ; EGPR-NEXT: movq %rax, %r17 -; EGPR-NEXT: addq %r8, %r17 +; EGPR-NEXT: addq %r24, %r17 +; EGPR-NEXT: adcq %rcx, %r9 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r23 +; EGPR-NEXT: movq %rax, %r24 +; EGPR-NEXT: addq %r9, %r24 ; EGPR-NEXT: movzbl %cl, %eax -; EGPR-NEXT: adcq %rax, %r10 -; EGPR-NEXT: addq %rdi, %r17 -; EGPR-NEXT: adcq %r18, %r10 -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; EGPR-NEXT: imulq %r15, %rdi +; EGPR-NEXT: adcq %rax, %r23 +; EGPR-NEXT: addq %r22, %r24 +; EGPR-NEXT: adcq %r27, %r23 ; EGPR-NEXT: movq %r15, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; EGPR-NEXT: mulq %r8 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: mulq %rdi ; EGPR-NEXT: movq %rax, %rcx -; EGPR-NEXT: addq %rdi, %rdx -; EGPR-NEXT: movq %r8, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; EGPR-NEXT: imulq %r15, %rax +; EGPR-NEXT: addq %rax, %rdx +; EGPR-NEXT: movq %rdi, %rax ; EGPR-NEXT: imulq %r14, %rax ; EGPR-NEXT: addq %rdx, %rax -; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: movq %rax, %r9 ; EGPR-NEXT: movq %rbx, %rdi -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload -; EGPR-NEXT: imulq %r19, %rdi +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; EGPR-NEXT: imulq %r11, %rdi ; EGPR-NEXT: movq %rbx, %rax -; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; EGPR-NEXT: mulq %r8 -; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r22 # 8-byte Reload +; EGPR-NEXT: mulq %r22 +; EGPR-NEXT: movq %rax, %r27 ; EGPR-NEXT: addq %rdi, %rdx -; EGPR-NEXT: imulq %r8, %r20 +; EGPR-NEXT: imulq %r22, %r20 ; EGPR-NEXT: addq %rdx, %r20 -; EGPR-NEXT: addq %rcx, %r30 -; EGPR-NEXT: adcq %r18, %r20 -; EGPR-NEXT: movq %r8, %rax -; EGPR-NEXT: movq %r8, %rdi +; EGPR-NEXT: addq %rcx, %r27 +; EGPR-NEXT: adcq %r9, %r20 +; EGPR-NEXT: movq %r22, %rax +; EGPR-NEXT: movq %r22, %rdi ; EGPR-NEXT: mulq %r15 ; EGPR-NEXT: movq %rdx, %rcx -; EGPR-NEXT: movq %rax, %r8 -; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: movq %r11, %rax ; EGPR-NEXT: mulq %r15 ; EGPR-NEXT: movq %rdx, %rbx ; EGPR-NEXT: movq %rax, %r15 @@ -964,39 +964,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NEXT: movq %rdi, %rax ; EGPR-NEXT: mulq %r14 ; EGPR-NEXT: movq %rdx, %rcx -; EGPR-NEXT: movq %rax, %r18 -; EGPR-NEXT: addq %r15, %r18 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %r15, %r9 ; EGPR-NEXT: adcq %rbx, %rcx ; EGPR-NEXT: setb %dil -; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: movq %r11, %rax ; EGPR-NEXT: mulq %r14 ; EGPR-NEXT: addq %rcx, %rax ; EGPR-NEXT: movzbl %dil, %ecx ; EGPR-NEXT: adcq %rcx, %rdx -; EGPR-NEXT: addq %r30, %rax +; EGPR-NEXT: addq %r27, %rax ; EGPR-NEXT: adcq %r20, %rdx -; EGPR-NEXT: addq %rsi, %r8 -; EGPR-NEXT: adcq %r11, %r18 -; EGPR-NEXT: adcq %r17, %rax +; EGPR-NEXT: addq %rsi, %r22 +; EGPR-NEXT: adcq %r17, %r9 +; EGPR-NEXT: adcq %r24, %rax +; EGPR-NEXT: adcq %r23, %rdx +; EGPR-NEXT: addq %r28, %r22 +; EGPR-NEXT: adcq %r13, %r9 +; EGPR-NEXT: adcq %r8, %rax ; EGPR-NEXT: adcq %r10, %rdx -; EGPR-NEXT: addq %r29, %r8 -; EGPR-NEXT: adcq %r13, %r18 -; EGPR-NEXT: adcq %r9, %rax -; EGPR-NEXT: adcq %r16, %rdx -; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), 
%r25 # 8-byte Folded Reload +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload ; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload -; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload -; EGPR-NEXT: adcq %r24, %r23 -; EGPR-NEXT: adcq %r22, %r8 -; EGPR-NEXT: adcq %r27, %r18 -; EGPR-NEXT: adcq %r31, %rax -; EGPR-NEXT: adcq %r26, %rdx -; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload +; EGPR-NEXT: adcq %r19, %r18 +; EGPR-NEXT: adcq %r16, %r22 +; EGPR-NEXT: adcq %r26, %r9 +; EGPR-NEXT: adcq %r30, %rax +; EGPR-NEXT: adcq %r25, %rdx +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload ; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload -; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload -; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload -; EGPR-NEXT: adcq (%rsp), %r8 # 8-byte Folded Reload -; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Folded Reload +; EGPR-NEXT: adcq (%rsp), %r18 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r22 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; EGPR-NEXT: adcq %r12, %rax ; EGPR-NEXT: adcq %rbp, %rdx ; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload @@ -1016,15 +1016,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NEXT: movq %rsi, 48(%rcx) ; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; EGPR-NEXT: movq %rsi, 56(%rcx) -; EGPR-NEXT: movq %r25, 64(%rcx) -; EGPR-NEXT: movq %r21, 72(%rcx) -; EGPR-NEXT: movq %r28, 80(%rcx) -; EGPR-NEXT: movq %r23, 88(%rcx) -; EGPR-NEXT: movq %r8, 96(%rcx) -; EGPR-NEXT: movq %r18, 104(%rcx) +; EGPR-NEXT: movq %r31, 64(%rcx) +; EGPR-NEXT: movq %r29, 72(%rcx) +; EGPR-NEXT: movq %r21, 80(%rcx) +; EGPR-NEXT: movq %r18, 88(%rcx) +; EGPR-NEXT: movq %r22, 96(%rcx) +; EGPR-NEXT: movq %r9, 104(%rcx) ; EGPR-NEXT: movq %rax, 112(%rcx) ; EGPR-NEXT: movq %rdx, 120(%rcx) -; EGPR-NEXT: addq $104, %rsp +; EGPR-NEXT: addq $96, %rsp ; EGPR-NEXT: popq %rbx ; EGPR-NEXT: popq %r12 ; EGPR-NEXT: popq %r13 @@ -1041,160 +1041,159 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: pushq %r13 ; EGPR-NDD-NEXT: pushq %r12 ; EGPR-NDD-NEXT: pushq %rbx -; EGPR-NDD-NEXT: subq $96, %rsp +; EGPR-NDD-NEXT: subq $80, %rsp ; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %rsi, %r15 -; EGPR-NDD-NEXT: movq %rdi, %r22 -; EGPR-NDD-NEXT: movq (%rdi), %r17 -; EGPR-NDD-NEXT: movq 8(%rdi), %r11 -; EGPR-NDD-NEXT: movq 24(%rdi), %r9 -; EGPR-NDD-NEXT: movq 16(%rdi), %r10 +; EGPR-NDD-NEXT: movq %rdi, %r25 ; EGPR-NDD-NEXT: movq 40(%rdi), %rdi -; EGPR-NDD-NEXT: movq 32(%r22), %r16 -; EGPR-NDD-NEXT: movq 56(%r22), %r18 -; EGPR-NDD-NEXT: movq 48(%r22), %r25 -; EGPR-NDD-NEXT: movq 24(%rsi), %r14 -; EGPR-NDD-NEXT: movq 16(%rsi), %r26 +; EGPR-NDD-NEXT: movq 56(%r25), %r15 +; EGPR-NDD-NEXT: movq 48(%r25), %r28 ; EGPR-NDD-NEXT: movq (%rsi), %r24 -; EGPR-NDD-NEXT: movq 8(%rsi), %r23 -; EGPR-NDD-NEXT: movq %r25, %rax +; EGPR-NDD-NEXT: movq 8(%rsi), %rbx +; EGPR-NDD-NEXT: movq %rsi, %r10 +; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: movq %rdx, %r27 -; EGPR-NDD-NEXT: movq %rax, %r19 -; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: movq %rdx, %r9 
+; EGPR-NDD-NEXT: movq %rax, %r11 +; EGPR-NDD-NEXT: movq %r15, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: addq %rax, %r9 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r23 -; EGPR-NDD-NEXT: addq %r27, %rax, %rsi +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r9, %rax, %rsi ; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: movq 32(%r25), %r9 ; EGPR-NDD-NEXT: setb %al ; EGPR-NDD-NEXT: movzbl %al, %r8d -; EGPR-NDD-NEXT: movq %r18, %rax -; EGPR-NDD-NEXT: mulq %r23 -; EGPR-NDD-NEXT: addq %rcx, %rax, %r31 +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %rcx, %rax, %r17 ; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: movq %rdx, %r30 +; EGPR-NDD-NEXT: movq %rdx, %r16 ; EGPR-NDD-NEXT: movq %rax, %r27 ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: addq %r30, %rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %r30 -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: addq %r16, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %r16 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %rax, %rcx -; EGPR-NDD-NEXT: adcq %rdx, %r30 +; EGPR-NDD-NEXT: adcq %rdx, %r16 +; EGPR-NDD-NEXT: movq 24(%r10), %r14 ; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: movzbl %al, %r20d +; EGPR-NDD-NEXT: movzbl %al, %r18d ; EGPR-NDD-NEXT: movq %rdi, %rax -; EGPR-NDD-NEXT: mulq %r23 -; EGPR-NDD-NEXT: addq %r30, %rax -; EGPR-NDD-NEXT: adcq %r20, %rdx -; EGPR-NDD-NEXT: addq %rax, %r19, %r20 -; EGPR-NDD-NEXT: adcq %rdx, %rsi, %r21 -; EGPR-NDD-NEXT: adcq $0, %r31 +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r16, %rax +; EGPR-NDD-NEXT: adcq %r18, %rdx +; EGPR-NDD-NEXT: addq %rax, %r11, %r18 +; EGPR-NDD-NEXT: adcq %rdx, %rsi, %r20 +; EGPR-NDD-NEXT: adcq $0, %r17, %r19 +; EGPR-NDD-NEXT: movq 16(%r10), %r26 ; EGPR-NDD-NEXT: adcq $0, %r8 -; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: movq %rdx, %r19 -; EGPR-NDD-NEXT: movq %rax, %r30 +; EGPR-NDD-NEXT: movq %rdx, %r11 +; EGPR-NDD-NEXT: movq %rax, %r17 ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: addq %rax, %r11 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r14 -; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: addq %rax, %r11 ; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: movq (%r25), %r12 ; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: movzbl %al, %r28d +; EGPR-NDD-NEXT: movzbl %al, %r23d ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rsi, %rax -; EGPR-NDD-NEXT: adcq %r28, %rdx -; EGPR-NDD-NEXT: addq %r20, %r30, %rsi -; EGPR-NDD-NEXT: adcq %r21, %r19, %r20 +; EGPR-NDD-NEXT: adcq %r23, %rdx +; EGPR-NDD-NEXT: addq %r18, %r17, %rsi +; EGPR-NDD-NEXT: adcq %r11, %r20 ; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: movq 24(%r25), %r16 ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %rax, %r31 +; EGPR-NDD-NEXT: addq %rax, %r19 ; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: 
movzbl %al, %r29d -; EGPR-NDD-NEXT: movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r25, %rax +; EGPR-NDD-NEXT: setb %r23b +; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: movq %rdx, %r19 -; EGPR-NDD-NEXT: movq %rax, %r30 -; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: movq %rdx, %r11 +; EGPR-NDD-NEXT: movq %rax, %r18 +; EGPR-NDD-NEXT: movq %r15, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %rax, %r19 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r21 -; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r14 -; EGPR-NDD-NEXT: addq %rax, %r19 -; EGPR-NDD-NEXT: adcq %rdx, %r21 +; EGPR-NDD-NEXT: addq %rax, %r11 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r30 +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r14 +; EGPR-NDD-NEXT: addq %rax, %r11 +; EGPR-NDD-NEXT: adcq %rdx, %r30 +; EGPR-NDD-NEXT: movzbl %r23b, %r29d ; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: movzbl %al, %r28d -; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: movzbl %al, %r23d +; EGPR-NDD-NEXT: movq %r15, %rax ; EGPR-NDD-NEXT: mulq %r14 -; EGPR-NDD-NEXT: addq %r21, %rax -; EGPR-NDD-NEXT: adcq %r28, %rdx -; EGPR-NDD-NEXT: addq %r31, %r30, %r21 -; EGPR-NDD-NEXT: adcq %r8, %r19, %r28 +; EGPR-NDD-NEXT: addq %r30, %rax +; EGPR-NDD-NEXT: adcq %r23, %rdx +; EGPR-NDD-NEXT: addq %r18, %r19 +; EGPR-NDD-NEXT: adcq %r8, %r11, %rdi ; EGPR-NDD-NEXT: adcq %rax, %r29 -; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq 16(%r25), %r23 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r18 +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: movq %rdx, %r19 +; EGPR-NDD-NEXT: movq %rdx, %r11 ; EGPR-NDD-NEXT: movq %rax, %r30 -; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: addq %rax, %r11 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: mulq %r23 -; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %rax, %r11 ; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: movq 8(%r25), %r9 ; EGPR-NDD-NEXT: setb %al ; EGPR-NDD-NEXT: movzbl %al, %r31d -; EGPR-NDD-NEXT: movq %r9, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %r31, %rdx, %rbx -; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: adcq %r31, %rdx, %r21 +; EGPR-NDD-NEXT: movq %r12, %r17 +; EGPR-NDD-NEXT: movq %r12, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r31 ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: addq %rax, %r31 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 ; EGPR-NDD-NEXT: movq %r17, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %r31, %rax ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq %rdx, %r12 ; EGPR-NDD-NEXT: setb %r31b -; EGPR-NDD-NEXT: movq %r11, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %r12, %rax ; EGPR-NDD-NEXT: movzbl %r31b, %r31d ; EGPR-NDD-NEXT: adcq %r31, %rdx ; EGPR-NDD-NEXT: addq %rax, %r30, %r12 -; EGPR-NDD-NEXT: adcq %rdx, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r11 ; EGPR-NDD-NEXT: adcq $0, %r8 -; 
EGPR-NDD-NEXT: adcq $0, %rbx +; EGPR-NDD-NEXT: adcq $0, %r21 ; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq %rax, %r31 -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r13 @@ -1203,72 +1202,73 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq %rdx, %r13 ; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %r13, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r13d ; EGPR-NDD-NEXT: adcq %r13, %rdx ; EGPR-NDD-NEXT: addq %r12, %r31 ; EGPR-NDD-NEXT: movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r30, %r19 -; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r30, %r11 +; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx ; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %rbx -; EGPR-NDD-NEXT: setb %r19b -; EGPR-NDD-NEXT: movq %r10, %r16 -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: adcq %rdx, %r21 +; EGPR-NDD-NEXT: setb %r11b +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq %rax, %r31 -; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq %rdx, %r12 ; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %r12, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r12d ; EGPR-NDD-NEXT: adcq %r12, %rdx ; EGPR-NDD-NEXT: addq %r31, %r8 -; EGPR-NDD-NEXT: adcq %r30, %rbx -; EGPR-NDD-NEXT: movzbl %r19b, %r19d -; EGPR-NDD-NEXT: adcq %r19, %rax +; EGPR-NDD-NEXT: adcq %r21, %r30 +; EGPR-NDD-NEXT: movzbl %r11b, %r11d +; EGPR-NDD-NEXT: adcq %r11, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx ; EGPR-NDD-NEXT: addq %r8, %r27, %r12 -; EGPR-NDD-NEXT: movq 32(%r15), %r30 -; EGPR-NDD-NEXT: adcq %rbx, %rcx, %r13 +; EGPR-NDD-NEXT: adcq %r30, %rcx, %r13 ; EGPR-NDD-NEXT: adcq %rax, %rsi, %rbp -; EGPR-NDD-NEXT: adcq %rdx, %r20, %rbx -; EGPR-NDD-NEXT: adcq $0, %r21 -; EGPR-NDD-NEXT: movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %r28 -; EGPR-NDD-NEXT: adcq $0, %r29 +; EGPR-NDD-NEXT: adcq %r20, %rdx +; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r19 +; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %rdi ; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: adcq $0, %r29 +; EGPR-NDD-NEXT: adcq $0, %r18 +; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq 32(%r10), %r30 +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: movq %rdx, %r27 ; EGPR-NDD-NEXT: movq %rax, %r31 -; EGPR-NDD-NEXT: movq %r9, %r19 -; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq %r16, %r18 +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: addq %rax, %r27 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: 
movq 40(%r15), %r18 -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq 40(%r10), %r22 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %r27, %rax, %r21 ; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: setb %r8b -; EGPR-NDD-NEXT: movq %r9, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %rcx, %rax, %rdi ; EGPR-NDD-NEXT: movzbl %r8b, %eax ; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi @@ -1276,18 +1276,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: movq %rdx, %r20 ; EGPR-NDD-NEXT: movq %rax, %r27 -; EGPR-NDD-NEXT: movq %r11, %r10 -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: addq %r20, %rax, %r8 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r20 ; EGPR-NDD-NEXT: movq %r17, %rax -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r8, %rax, %r25 +; EGPR-NDD-NEXT: movq %r17, %r16 +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r8, %rax, %r19 ; EGPR-NDD-NEXT: adcq %rdx, %r20 ; EGPR-NDD-NEXT: setb %cl -; EGPR-NDD-NEXT: movq %r11, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r9, %r17 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %r20, %rax ; EGPR-NDD-NEXT: movzbl %cl, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx @@ -1295,26 +1296,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: adcq %rcx, %r21, %r8 ; EGPR-NDD-NEXT: adcq $0, %rdi ; EGPR-NDD-NEXT: adcq $0, %rsi, %r9 -; EGPR-NDD-NEXT: movq 48(%r15), %r11 -; EGPR-NDD-NEXT: movq %r17, %rsi -; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq 48(%r10), %r11 +; EGPR-NDD-NEXT: movq %r16, %rsi +; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r20 ; EGPR-NDD-NEXT: movq %rax, %r21 -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: movq 56(%r15), %r17 +; EGPR-NDD-NEXT: movq 56(%r10), %r16 ; EGPR-NDD-NEXT: movq %rsi, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: setb %sil -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %rcx, %rax ; EGPR-NDD-NEXT: movzbl %sil, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx @@ -1325,23 +1326,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %rax, %rdi ; EGPR-NDD-NEXT: adcq %rcx, %r9, %r8 ; EGPR-NDD-NEXT: setb %sil -; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r20 ; EGPR-NDD-NEXT: movq %rax, %r21 -; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r19, %rax +; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 -; 
EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq %rdx, %r9 ; EGPR-NDD-NEXT: setb %cl -; EGPR-NDD-NEXT: movq %r19, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %r9, %rax ; EGPR-NDD-NEXT: movzbl %cl, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx @@ -1352,67 +1352,65 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: adcq $0, %rcx ; EGPR-NDD-NEXT: addq %r12, %r27 ; EGPR-NDD-NEXT: movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r13, %r25, %r19 +; EGPR-NDD-NEXT: adcq %r13, %r19 ; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq %rbp, %r31 ; EGPR-NDD-NEXT: movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %rbx, %r10 +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload ; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: adcq $0, %rdi, %rsi ; EGPR-NDD-NEXT: adcq $0, %r8 ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rcx -; EGPR-NDD-NEXT: addq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r8, %r28 +; EGPR-NDD-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %rax, %r29 -; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; EGPR-NDD-NEXT: setb %r8b -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r13, %rax +; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: setb %cl +; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: movq %rdx, %r27 ; EGPR-NDD-NEXT: movq %rax, %r20 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r15, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: addq %rax, %r27 -; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi -; EGPR-NDD-NEXT: movq %r13, %rax -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r27, %rax, %rdi -; EGPR-NDD-NEXT: adcq %rdx, %rsi -; EGPR-NDD-NEXT: setb %r9b -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: movq %r10, %r16 -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: movzbl %r9b, %eax -; EGPR-NDD-NEXT: adcq %rax, %rdx, %r9 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r25, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r27, %rax, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: setb %r8b +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: movzbl %r8b, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %r10 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: movq %rdx, %r21 ; EGPR-NDD-NEXT: movq %rax, %r27 ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; EGPR-NDD-NEXT: movq %r12, %rax ; EGPR-NDD-NEXT: mulq %r30 -; EGPR-NDD-NEXT: addq %rax, %r21 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 -; 
EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r21, %rax, %rbx -; EGPR-NDD-NEXT: adcq %rdx, %r10 +; EGPR-NDD-NEXT: addq %r21, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r19 +; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r19 ; EGPR-NDD-NEXT: setb %r31b ; EGPR-NDD-NEXT: movq %r12, %rax -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r10, %rax -; EGPR-NDD-NEXT: movzbl %r31b, %r10d -; EGPR-NDD-NEXT: adcq %r10, %rdx -; EGPR-NDD-NEXT: addq %rax, %r20, %r10 -; EGPR-NDD-NEXT: adcq %rdx, %rdi -; EGPR-NDD-NEXT: adcq $0, %rsi -; EGPR-NDD-NEXT: adcq $0, %r9 -; EGPR-NDD-NEXT: movq %r25, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r19, %rax +; EGPR-NDD-NEXT: movzbl %r31b, %r19d +; EGPR-NDD-NEXT: adcq %r19, %rdx +; EGPR-NDD-NEXT: addq %rax, %r20, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: adcq $0, %r10 +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r20 ; EGPR-NDD-NEXT: movq %rax, %r21 @@ -1420,64 +1418,66 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r31 -; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq %rdx, %r31 ; EGPR-NDD-NEXT: setb %bpl ; EGPR-NDD-NEXT: movq %r12, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %r31, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r31d ; EGPR-NDD-NEXT: adcq %r31, %rdx -; EGPR-NDD-NEXT: addq %r21, %r10 -; EGPR-NDD-NEXT: adcq %r20, %rdi +; EGPR-NDD-NEXT: addq %r21, %r19 +; EGPR-NDD-NEXT: adcq %r20, %r9 ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: adcq %rdx, %r10 ; EGPR-NDD-NEXT: setb %r31b -; EGPR-NDD-NEXT: movq %r13, %rax +; EGPR-NDD-NEXT: movq %r28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r20 ; EGPR-NDD-NEXT: movq %rax, %r21 -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r15, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 -; EGPR-NDD-NEXT: movq %r13, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %rax, %r20 ; EGPR-NDD-NEXT: adcq %rdx, %r12 ; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %r12, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r12d ; EGPR-NDD-NEXT: adcq %r12, %rdx -; EGPR-NDD-NEXT: addq %r21, %rsi -; EGPR-NDD-NEXT: adcq %r20, %r9 +; EGPR-NDD-NEXT: addq %r21, %rdi +; EGPR-NDD-NEXT: adcq %r20, %r10 ; EGPR-NDD-NEXT: movzbl %r31b, %r31d ; EGPR-NDD-NEXT: adcq %r31, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %r27, %r19 +; EGPR-NDD-NEXT: addq %r27, %rsi +; EGPR-NDD-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r13, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r29, %r19 ; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: 
adcq %rbx, %r28 -; EGPR-NDD-NEXT: movq %r28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r29, %r10 -; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movzbl %cl, %ecx ; EGPR-NDD-NEXT: adcq %rdi, %rcx ; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movzbl %r8b, %ecx -; EGPR-NDD-NEXT: adcq %rsi, %rcx -; EGPR-NDD-NEXT: movq %rcx, (%rsp) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %r9 -; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r10 +; EGPR-NDD-NEXT: movq %r10, (%rsp) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %rdx ; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq 64(%r22), %r20 +; EGPR-NDD-NEXT: movq 64(%r25), %r20 ; EGPR-NDD-NEXT: movq %r26, %rax ; EGPR-NDD-NEXT: mulq %r20 ; EGPR-NDD-NEXT: movq %rdx, %r27 @@ -1486,7 +1486,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: mulq %r20 ; EGPR-NDD-NEXT: addq %rax, %r27 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: movq 72(%r22), %r21 +; EGPR-NDD-NEXT: movq 72(%r25), %r21 ; EGPR-NDD-NEXT: movq %r26, %rax ; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %rax, %r27 @@ -1501,7 +1501,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: mulq %r20 ; EGPR-NDD-NEXT: movq %rdx, %r29 ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %rbx, %rax ; EGPR-NDD-NEXT: mulq %r20 ; EGPR-NDD-NEXT: addq %rax, %r29 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi @@ -1511,37 +1511,36 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq %rdx, %rdi ; EGPR-NDD-NEXT: setb %r8b -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %rbx, %rax ; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %rdi, %rax ; EGPR-NDD-NEXT: movzbl %r8b, %edi ; EGPR-NDD-NEXT: adcq %rdi, %rdx -; EGPR-NDD-NEXT: addq %rax, %r28, %rdi +; EGPR-NDD-NEXT: addq %rax, %r28, %r13 ; EGPR-NDD-NEXT: adcq %rdx, %r27 ; EGPR-NDD-NEXT: adcq $0, %rcx ; EGPR-NDD-NEXT: adcq $0, %rsi -; EGPR-NDD-NEXT: movq 80(%r22), %r8 +; EGPR-NDD-NEXT: movq 80(%r25), %r8 ; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %rbx, %rax ; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: addq %rax, %r28 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 -; EGPR-NDD-NEXT: movq 88(%r22), %rbx +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi +; EGPR-NDD-NEXT: movq 88(%r25), %r15 ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r15 ; EGPR-NDD-NEXT: addq %rax, %r28 -; EGPR-NDD-NEXT: adcq %rdx, %r9 -; EGPR-NDD-NEXT: setb %r10b -; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %rbx -; EGPR-NDD-NEXT: addq %r9, %rax -; EGPR-NDD-NEXT: movzbl %r10b, %r9d -; EGPR-NDD-NEXT: adcq %r9, %rdx -; EGPR-NDD-NEXT: addq %r29, %rdi -; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %rbx, %rax +; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: addq %rdi, %rax +; EGPR-NDD-NEXT: 
movzbl %r9b, %edi +; EGPR-NDD-NEXT: adcq %rdi, %rdx +; EGPR-NDD-NEXT: addq %r29, %r13 ; EGPR-NDD-NEXT: adcq %r27, %r28, %rbp ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx @@ -1557,32 +1556,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r15 ; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq %rdx, %r9 ; EGPR-NDD-NEXT: setb %r10b ; EGPR-NDD-NEXT: movq %r14, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r15 ; EGPR-NDD-NEXT: addq %r9, %rax ; EGPR-NDD-NEXT: movzbl %r10b, %r9d ; EGPR-NDD-NEXT: adcq %r9, %rdx ; EGPR-NDD-NEXT: addq %rcx, %r29, %r27 ; EGPR-NDD-NEXT: adcq %rsi, %r28, %r12 -; EGPR-NDD-NEXT: movzbl %dil, %r19d -; EGPR-NDD-NEXT: adcq %rax, %r19 +; EGPR-NDD-NEXT: movzbl %dil, %r9d +; EGPR-NDD-NEXT: adcq %rax, %r9 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r29 -; EGPR-NDD-NEXT: imulq %r30, %rbx +; EGPR-NDD-NEXT: imulq %r30, %r15 ; EGPR-NDD-NEXT: movq %r30, %rax ; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: movq %rax, %r28 -; EGPR-NDD-NEXT: addq %rbx, %rdx -; EGPR-NDD-NEXT: imulq %r18, %r8 +; EGPR-NDD-NEXT: addq %r15, %rdx +; EGPR-NDD-NEXT: imulq %r22, %r8 ; EGPR-NDD-NEXT: addq %rdx, %r8 ; EGPR-NDD-NEXT: imulq %r21, %r11, %rcx ; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r20 ; EGPR-NDD-NEXT: addq %rdx, %rcx -; EGPR-NDD-NEXT: imulq %r20, %r17, %r16 +; EGPR-NDD-NEXT: imulq %r20, %r16 ; EGPR-NDD-NEXT: addq %r16, %rcx ; EGPR-NDD-NEXT: addq %r28, %rax, %rsi ; EGPR-NDD-NEXT: adcq %rcx, %r8 @@ -1595,35 +1594,35 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %r28, %rax, %rcx ; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %rax, %rcx ; EGPR-NDD-NEXT: adcq %rdx, %rdi -; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: setb %r10b ; EGPR-NDD-NEXT: movq %r21, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %rdi, %rax -; EGPR-NDD-NEXT: movzbl %r9b, %edi +; EGPR-NDD-NEXT: movzbl %r10b, %edi ; EGPR-NDD-NEXT: adcq %rdi, %rdx ; EGPR-NDD-NEXT: addq %rax, %rsi ; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: movq 112(%r22), %rdi +; EGPR-NDD-NEXT: movq 112(%r25), %rdi ; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %rdi ; EGPR-NDD-NEXT: movq %rax, %r30 -; EGPR-NDD-NEXT: imulq %r23, %rdi -; EGPR-NDD-NEXT: addq %rdi, %rdx -; EGPR-NDD-NEXT: imulq 120(%r22), %r24, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %r9 -; EGPR-NDD-NEXT: movq 96(%r22), %r20 -; EGPR-NDD-NEXT: movq 104(%r22), %rdi -; EGPR-NDD-NEXT: imulq %rdi, %r26, %r10 +; EGPR-NDD-NEXT: imulq 120(%r25), %r24, %rax +; EGPR-NDD-NEXT: addq %rdx, %rax +; EGPR-NDD-NEXT: imulq %rbx, %rdi +; EGPR-NDD-NEXT: addq %rdi, %rax, %r10 +; EGPR-NDD-NEXT: movq 96(%r25), %r20 +; EGPR-NDD-NEXT: movq 104(%r25), %rdi +; EGPR-NDD-NEXT: imulq %rdi, %r26, %r11 ; EGPR-NDD-NEXT: movq %r26, %rax ; EGPR-NDD-NEXT: mulq %r20 -; EGPR-NDD-NEXT: addq %r10, %rdx +; EGPR-NDD-NEXT: addq %r11, %rdx ; EGPR-NDD-NEXT: imulq %r20, %r14, %r25 ; EGPR-NDD-NEXT: addq %r25, %rdx ; EGPR-NDD-NEXT: addq %rax, %r30 -; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %r10 ; EGPR-NDD-NEXT: movq %r20, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r25 @@ -1631,94 +1630,94 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: addq 
%rax, %r25 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r11 ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %rax, %r25 -; EGPR-NDD-NEXT: adcq %rdx, %r10 -; EGPR-NDD-NEXT: setb %r11b +; EGPR-NDD-NEXT: adcq %rdx, %r11 +; EGPR-NDD-NEXT: setb %r16b ; EGPR-NDD-NEXT: movq %rdi, %rax -; EGPR-NDD-NEXT: mulq %r23 -; EGPR-NDD-NEXT: addq %r10, %rax -; EGPR-NDD-NEXT: movzbl %r11b, %edi +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r11, %rax +; EGPR-NDD-NEXT: movzbl %r16b, %edi ; EGPR-NDD-NEXT: adcq %rdi, %rdx ; EGPR-NDD-NEXT: addq %r30, %rax -; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: adcq %r10, %rdx ; EGPR-NDD-NEXT: addq %r31, %r26 ; EGPR-NDD-NEXT: adcq %r25, %rcx ; EGPR-NDD-NEXT: adcq %rsi, %rax ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %r26, %r27, %rbx +; EGPR-NDD-NEXT: addq %r26, %r27, %r15 ; EGPR-NDD-NEXT: adcq %rcx, %r12 -; EGPR-NDD-NEXT: adcq %rax, %r19, %r13 +; EGPR-NDD-NEXT: adcq %rax, %r9, %r21 ; EGPR-NDD-NEXT: adcq %rdx, %r29, %r28 -; EGPR-NDD-NEXT: movq 80(%r15), %r24 -; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: movq 80(%r16), %r24 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: movq %rax, %r30 ; EGPR-NDD-NEXT: movq %rdx, %rdi -; EGPR-NDD-NEXT: movq 88(%r15), %r22 +; EGPR-NDD-NEXT: movq 88(%r16), %r22 ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: addq %rax, %rdi ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %rax, %rdi ; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: setb %sil ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: movq %r17, %rbx ; EGPR-NDD-NEXT: addq %rax, %rcx ; EGPR-NDD-NEXT: movzbl %sil, %eax ; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi -; EGPR-NDD-NEXT: movq 64(%r15), %r26 +; EGPR-NDD-NEXT: movq 64(%r16), %r26 ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: movq %rax, %r21 +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %rax, %r20 ; EGPR-NDD-NEXT: movq %rdx, %r31 -; EGPR-NDD-NEXT: movq 72(%r15), %r25 +; EGPR-NDD-NEXT: movq 72(%r16), %r25 ; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: addq %rax, %r31 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r31, %rax, %r29 ; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r8, %rax ; EGPR-NDD-NEXT: movzbl %r9b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %rax, %r30, %r20 +; EGPR-NDD-NEXT: addq %rax, %r30, %r11 ; EGPR-NDD-NEXT: adcq %rdx, %rdi ; EGPR-NDD-NEXT: adcq $0, %rcx ; EGPR-NDD-NEXT: adcq $0, %rsi ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: mulq %r23 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq %rax, %r31 ; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: mulq %r23 ; 
EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r30, %rax, %r27 ; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r25, %rax -; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r8, %rax ; EGPR-NDD-NEXT: movzbl %r9b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %r31, %r20 +; EGPR-NDD-NEXT: addq %r31, %r11 ; EGPR-NDD-NEXT: adcq %rdi, %r27 ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx @@ -1726,127 +1725,125 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: adcq %rdx, %rsi ; EGPR-NDD-NEXT: setb %dil ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: mulq %r23 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq %rax, %r31 ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: mulq %r23 ; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r11 -; EGPR-NDD-NEXT: addq %r30, %rax, %r19 +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %r30, %rax, %r9 ; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: setb %r10b ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r8, %rax -; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: movzbl %r10b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx ; EGPR-NDD-NEXT: addq %rcx, %r31 -; EGPR-NDD-NEXT: adcq %rsi, %r19 +; EGPR-NDD-NEXT: adcq %rsi, %r9 ; EGPR-NDD-NEXT: movzbl %dil, %ecx ; EGPR-NDD-NEXT: adcq %rax, %rcx ; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi -; EGPR-NDD-NEXT: movq 96(%r15), %r30 -; EGPR-NDD-NEXT: imulq %r11, %r30, %rsi +; EGPR-NDD-NEXT: movq 96(%r16), %r30 +; EGPR-NDD-NEXT: imulq %r17, %r30, %rsi ; EGPR-NDD-NEXT: movq %r30, %rax -; EGPR-NDD-NEXT: mulq %r10 -; EGPR-NDD-NEXT: movq %rax, %r18 +; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: movq %rax, %r17 ; EGPR-NDD-NEXT: addq %rsi, %rdx -; EGPR-NDD-NEXT: movq 104(%r15), %r8 -; EGPR-NDD-NEXT: imulq %r10, %r8, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %rsi -; EGPR-NDD-NEXT: movq 112(%r15), %rax -; EGPR-NDD-NEXT: imulq %r23, %rax, %r9 -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: imulq 120(%r15), %r16, %r9 -; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: addq %r18, %rax, %r10 -; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r9 -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: movq %r16, %r18 +; EGPR-NDD-NEXT: movq 104(%r16), %r8 +; EGPR-NDD-NEXT: imulq %r8, %r23 +; EGPR-NDD-NEXT: addq %rdx, %r23 +; EGPR-NDD-NEXT: movq 112(%r16), %rax +; EGPR-NDD-NEXT: imulq %rbx, %rax, %rsi +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: imulq 120(%r16), %r18, %rsi +; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: addq %r17, %rax, %r16 +; EGPR-NDD-NEXT: adcq %rdx, %r23 +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r30 ; EGPR-NDD-NEXT: movq %rdx, %r17 ; EGPR-NDD-NEXT: movq %rax, %rsi -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %rbx, %rax ; EGPR-NDD-NEXT: mulq %r30 -; EGPR-NDD-NEXT: addq %r17, %rax, %r11 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r16 +; EGPR-NDD-NEXT: addq %r17, %rax, %r10 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r17 ; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: 
mulq %r8 -; EGPR-NDD-NEXT: addq %rax, %r11 -; EGPR-NDD-NEXT: adcq %rdx, %r16 -; EGPR-NDD-NEXT: setb %r17b -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: addq %rax, %r10 +; EGPR-NDD-NEXT: adcq %rdx, %r17 +; EGPR-NDD-NEXT: setb %r19b +; EGPR-NDD-NEXT: movq %rbx, %rax ; EGPR-NDD-NEXT: mulq %r8 -; EGPR-NDD-NEXT: addq %r16, %rax -; EGPR-NDD-NEXT: movzbl %r17b, %r8d +; EGPR-NDD-NEXT: addq %r17, %rax +; EGPR-NDD-NEXT: movzbl %r19b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %rax, %r10 -; EGPR-NDD-NEXT: adcq %r9, %rdx, %r17 -; EGPR-NDD-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r26, %r8 # 8-byte Folded Reload +; EGPR-NDD-NEXT: addq %r16, %rax, %r17 +; EGPR-NDD-NEXT: adcq %r23, %rdx, %r18 +; EGPR-NDD-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r26, %r16 # 8-byte Folded Reload ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: movq %rax, %r9 -; EGPR-NDD-NEXT: addq %r8, %rdx -; EGPR-NDD-NEXT: imulq %r16, %r25, %rax -; EGPR-NDD-NEXT: addq %rax, %rdx, %r8 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload -; EGPR-NDD-NEXT: imulq %r23, %r24, %r16 -; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r30 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r19 +; EGPR-NDD-NEXT: movq %rax, %r8 ; EGPR-NDD-NEXT: addq %r16, %rdx -; EGPR-NDD-NEXT: imulq %r30, %r22 +; EGPR-NDD-NEXT: imulq %r19, %r25, %rax +; EGPR-NDD-NEXT: addq %rax, %rdx, %r16 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Reload +; EGPR-NDD-NEXT: imulq %r30, %r24, %r19 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r24 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %r19, %rdx +; EGPR-NDD-NEXT: imulq %r24, %r22 ; EGPR-NDD-NEXT: addq %r22, %rdx -; EGPR-NDD-NEXT: addq %r9, %rax, %r16 -; EGPR-NDD-NEXT: adcq %r8, %rdx, %r18 -; EGPR-NDD-NEXT: movq %r30, %rax +; EGPR-NDD-NEXT: addq %r8, %rax, %r19 +; EGPR-NDD-NEXT: adcq %r16, %rdx, %r22 +; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r8 -; EGPR-NDD-NEXT: movq %rax, %r9 -; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: movq %r23, %r24 +; EGPR-NDD-NEXT: movq %rax, %r16 +; EGPR-NDD-NEXT: movq %r30, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r22 -; EGPR-NDD-NEXT: movq %r30, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx, %r23 +; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r25 ; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %r22 -; EGPR-NDD-NEXT: setb %r23b -; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: adcq %rdx, %r23 +; EGPR-NDD-NEXT: setb %r24b +; EGPR-NDD-NEXT: movq %r30, %rax ; EGPR-NDD-NEXT: mulq %r25 -; EGPR-NDD-NEXT: addq %r22, %rax -; EGPR-NDD-NEXT: movzbl %r23b, %r22d +; EGPR-NDD-NEXT: addq %r23, %rax +; EGPR-NDD-NEXT: movzbl %r24b, %r23d +; EGPR-NDD-NEXT: adcq %r23, %rdx +; EGPR-NDD-NEXT: addq %r19, %rax ; EGPR-NDD-NEXT: adcq %r22, %rdx -; EGPR-NDD-NEXT: addq %r16, %rax +; EGPR-NDD-NEXT: addq %r16, %rsi +; EGPR-NDD-NEXT: adcq %r10, %r8 +; EGPR-NDD-NEXT: adcq %r17, %rax ; EGPR-NDD-NEXT: adcq %r18, %rdx -; EGPR-NDD-NEXT: addq %r9, %rsi -; EGPR-NDD-NEXT: adcq %r11, %r8 -; EGPR-NDD-NEXT: adcq %r10, %rax -; EGPR-NDD-NEXT: adcq %r17, %rdx ; EGPR-NDD-NEXT: addq %r31, %rsi -; EGPR-NDD-NEXT: adcq %r19, %r8 +; EGPR-NDD-NEXT: adcq %r9, %r8 ; EGPR-NDD-NEXT: adcq %rcx, %rax ; 
EGPR-NDD-NEXT: adcq %rdi, %rdx -; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload +; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r13, %r11 ; EGPR-NDD-NEXT: adcq %rbp, %r27 -; EGPR-NDD-NEXT: adcq %rbx, %rsi +; EGPR-NDD-NEXT: adcq %r15, %rsi ; EGPR-NDD-NEXT: adcq %r12, %r8 -; EGPR-NDD-NEXT: adcq %r13, %rax +; EGPR-NDD-NEXT: adcq %r21, %rax ; EGPR-NDD-NEXT: adcq %r28, %rdx -; EGPR-NDD-NEXT: addq %r21, {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload +; EGPR-NDD-NEXT: addq %r20, {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %r29, {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r20, {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r11, {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %r27, {{[-0-9]+}}(%r{{[sb]}}p), %r27 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %rsi, (%rsp), %rsi # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r8, (%rsp), %r8 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload @@ -1866,15 +1863,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %rdi, 48(%rcx) ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; EGPR-NDD-NEXT: movq %rdi, 56(%rcx) -; EGPR-NDD-NEXT: movq %r21, 64(%rcx) +; EGPR-NDD-NEXT: movq %r20, 64(%rcx) ; EGPR-NDD-NEXT: movq %r29, 72(%rcx) -; EGPR-NDD-NEXT: movq %r20, 80(%rcx) +; EGPR-NDD-NEXT: movq %r11, 80(%rcx) ; EGPR-NDD-NEXT: movq %r27, 88(%rcx) ; EGPR-NDD-NEXT: movq %rsi, 96(%rcx) ; EGPR-NDD-NEXT: movq %r8, 104(%rcx) ; EGPR-NDD-NEXT: movq %rax, 112(%rcx) ; EGPR-NDD-NEXT: movq %rdx, 120(%rcx) -; EGPR-NDD-NEXT: addq $96, %rsp +; EGPR-NDD-NEXT: addq $80, %rsp ; EGPR-NDD-NEXT: popq %rbx ; EGPR-NDD-NEXT: popq %r12 ; EGPR-NDD-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll index 3c2ef21527f50..787a9917b7497 100644 --- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll +++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll @@ -222,9 +222,9 @@ define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind { ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: pextrw $0, %xmm0, %edx ; X86-NEXT: #ARITH_FENCE +; X86-NEXT: pinsrw $0, %eax, %xmm0 ; X86-NEXT: #ARITH_FENCE ; X86-NEXT: #ARITH_FENCE -; X86-NEXT: pinsrw $0, %eax, %xmm0 ; X86-NEXT: pinsrw $0, %ecx, %xmm1 ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: pinsrw $0, %edx, %xmm1 @@ -240,9 +240,9 @@ define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind { ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: pextrw $0, %xmm0, %edx ; X64-NEXT: #ARITH_FENCE +; X64-NEXT: pinsrw $0, %eax, %xmm0 ; X64-NEXT: #ARITH_FENCE ; X64-NEXT: #ARITH_FENCE -; X64-NEXT: pinsrw $0, %eax, %xmm0 ; X64-NEXT: pinsrw $0, %ecx, %xmm1 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: pinsrw 
$0, %edx, %xmm1 @@ -269,13 +269,13 @@ define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind { ; X86-NEXT: #ARITH_FENCE ; X86-NEXT: #ARITH_FENCE ; X86-NEXT: #ARITH_FENCE -; X86-NEXT: pinsrw $0, %eax, %xmm0 -; X86-NEXT: pinsrw $0, %edx, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: pinsrw $0, %eax, %xmm1 +; X86-NEXT: pinsrw $0, %edx, %xmm2 ; X86-NEXT: pinsrw $0, %ecx, %xmm0 -; X86-NEXT: pinsrw $0, %esi, %xmm2 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: pinsrw $0, %esi, %xmm3 +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -294,13 +294,13 @@ define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind { ; X64-NEXT: #ARITH_FENCE ; X64-NEXT: #ARITH_FENCE ; X64-NEXT: #ARITH_FENCE -; X64-NEXT: pinsrw $0, %eax, %xmm0 -; X64-NEXT: pinsrw $0, %ecx, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pinsrw $0, %eax, %xmm1 +; X64-NEXT: pinsrw $0, %ecx, %xmm2 ; X64-NEXT: pinsrw $0, %edx, %xmm0 -; X64-NEXT: pinsrw $0, %esi, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: pinsrw $0, %esi, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: retq %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> %a) ret <4 x bfloat> %b diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll index 8f91f4120842b..c278ec2eb7989 100644 --- a/llvm/test/CodeGen/X86/atomic-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll @@ -568,9 +568,9 @@ if.end: ; preds = %entry define i32 @split_hoist_and(i32 %0) nounwind { ; X86-LABEL: split_hoist_and: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: lock btsl $3, v32 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: setb %al ; X86-NEXT: shll $3, %eax ; X86-NEXT: testl %ecx, %ecx diff --git a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll index 0c5abef83c45d..5c0cf1ea90846 100644 --- a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll +++ b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll @@ -5,15 +5,15 @@ define i32 @test_add_1_cmov_slt(ptr %p, i32 %a0, i32 %a1) #0 { ; FASTINCDEC-LABEL: test_add_1_cmov_slt: ; FASTINCDEC: # %bb.0: # %entry -; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: lock incq (%rdi) +; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: cmovgl %edx, %eax ; FASTINCDEC-NEXT: retq ; ; SLOWINCDEC-LABEL: test_add_1_cmov_slt: ; SLOWINCDEC: # %bb.0: # %entry -; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: lock addq $1, (%rdi) +; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: cmovgl %edx, %eax ; SLOWINCDEC-NEXT: retq entry: @@ -26,15 +26,15 @@ entry: define i32 @test_add_1_cmov_sge(ptr %p, 
i32 %a0, i32 %a1) #0 { ; FASTINCDEC-LABEL: test_add_1_cmov_sge: ; FASTINCDEC: # %bb.0: # %entry -; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: lock incq (%rdi) +; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: cmovlel %edx, %eax ; FASTINCDEC-NEXT: retq ; ; SLOWINCDEC-LABEL: test_add_1_cmov_sge: ; SLOWINCDEC: # %bb.0: # %entry -; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: lock addq $1, (%rdi) +; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: cmovlel %edx, %eax ; SLOWINCDEC-NEXT: retq entry: @@ -47,15 +47,15 @@ entry: define i32 @test_sub_1_cmov_sle(ptr %p, i32 %a0, i32 %a1) #0 { ; FASTINCDEC-LABEL: test_sub_1_cmov_sle: ; FASTINCDEC: # %bb.0: # %entry -; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: lock decq (%rdi) +; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: cmovgel %edx, %eax ; FASTINCDEC-NEXT: retq ; ; SLOWINCDEC-LABEL: test_sub_1_cmov_sle: ; SLOWINCDEC: # %bb.0: # %entry -; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: lock subq $1, (%rdi) +; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: cmovgel %edx, %eax ; SLOWINCDEC-NEXT: retq entry: @@ -68,15 +68,15 @@ entry: define i32 @test_sub_1_cmov_sgt(ptr %p, i32 %a0, i32 %a1) #0 { ; FASTINCDEC-LABEL: test_sub_1_cmov_sgt: ; FASTINCDEC: # %bb.0: # %entry -; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: lock decq (%rdi) +; FASTINCDEC-NEXT: movl %esi, %eax ; FASTINCDEC-NEXT: cmovll %edx, %eax ; FASTINCDEC-NEXT: retq ; ; SLOWINCDEC-LABEL: test_sub_1_cmov_sgt: ; SLOWINCDEC: # %bb.0: # %entry -; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: lock addq $-1, (%rdi) +; SLOWINCDEC-NEXT: movl %esi, %eax ; SLOWINCDEC-NEXT: cmovll %edx, %eax ; SLOWINCDEC-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index fe79dfe39f645..a077d6fa7a520 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -1046,17 +1046,17 @@ define dso_local void @fsub_64g() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, glob64 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movlps %xmm1, glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl @@ -1212,17 +1212,17 @@ define dso_local void @fsub_64imm() nounwind { ; X86-SSE1-NEXT: andl $-8, %esp ; X86-SSE1-NEXT: subl $16, %esp ; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: xorps %xmm1, %xmm1 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-SSE1-NEXT: movss %xmm1, (%esp) -; X86-SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-SSE1-NEXT: movss %xmm1, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movss %xmm0, (%esp) +; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE1-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: fld1 ; 
X86-SSE1-NEXT: fchs ; X86-SSE1-NEXT: faddl (%esp) ; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-SSE1-NEXT: movlps %xmm0, -559038737 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; X86-SSE1-NEXT: movlps %xmm1, -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp ; X86-SSE1-NEXT: retl diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll index 0d0108f55f2ab..13c8fb6f9bfe5 100644 --- a/llvm/test/CodeGen/X86/atomic-mi.ll +++ b/llvm/test/CodeGen/X86/atomic-mi.ll @@ -367,8 +367,8 @@ define void @add_64r(ptr %p, i64 %v) { ; X32-NEXT: fildll (%eax) ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: addl 12(%ebp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: adcl 16(%ebp), %edx ; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -550,8 +550,8 @@ define void @sub_64r(ptr %p, i64 %v) { ; X32-NEXT: fildll (%eax) ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: subl 12(%ebp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: sbbl 16(%ebp), %edx ; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index b4d40fee01e41..a3efe8c86ce85 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -333,33 +333,33 @@ entry: define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind { ; X86-LABEL: atomic_shl1_and_8_gpr_brnz: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movb %bl, %ah +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movb %dl, %ah ; X86-NEXT: notb %ah -; X86-NEXT: movb (%edx), %al +; X86-NEXT: movb (%esi), %al ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb %ah, %ch -; X86-NEXT: lock cmpxchgb %ch, (%edx) +; X86-NEXT: lock cmpxchgb %ch, (%esi) ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: testl %eax, %ebx +; X86-NEXT: testl %eax, %edx ; X86-NEXT: je .LBB6_3 ; X86-NEXT: # %bb.4: # %if.then ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: movzbl (%edx,%eax), %eax -; X86-NEXT: popl %ebx +; X86-NEXT: movzbl (%esi,%eax), %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB6_3: ; X86-NEXT: movb $123, %al -; X86-NEXT: popl %ebx +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: atomic_shl1_and_8_gpr_brnz: @@ -491,29 +491,29 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun ; X86-LABEL: atomic_shl1_mask01_and_8_gpr_brnz: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb %ah, %cl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: andb $7, %cl ; X86-NEXT: movl $1, %ebx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: notb %cl -; X86-NEXT: movb (%edx), %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb %bl, %ah 
+; X86-NEXT: notb %ah +; X86-NEXT: movb (%ecx), %al ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB8_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movb %al, %ch -; X86-NEXT: andb %cl, %ch -; X86-NEXT: lock cmpxchgb %ch, (%edx) +; X86-NEXT: movb %al, %dh +; X86-NEXT: andb %ah, %dh +; X86-NEXT: lock cmpxchgb %dh, (%ecx) ; X86-NEXT: jne .LBB8_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: testl %ecx, %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: testl %eax, %ebx ; X86-NEXT: je .LBB8_3 ; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movzbl %ah, %eax -; X86-NEXT: movzbl (%edx,%eax), %eax +; X86-NEXT: movzbl %dl, %eax +; X86-NEXT: movzbl (%ecx,%eax), %eax ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; X86-NEXT: .LBB8_3: @@ -1068,11 +1068,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_valz(ptr %v, i16 zeroext % ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $7, %ecx ; X86-NEXT: movl $1, %esi ; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl (%edx), %eax ; X86-NEXT: movzwl %si, %esi ; X86-NEXT: .p2align 4 @@ -1461,11 +1461,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_valnz(ptr %v, i16 zeroext ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $7, %ecx ; X86-NEXT: movl $1, %esi ; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl (%edx), %eax ; X86-NEXT: movzwl %si, %esi ; X86-NEXT: .p2align 4 @@ -1871,11 +1871,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_brz(ptr %v, i16 zeroext %c ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $7, %ecx ; X86-NEXT: movl $1, %esi ; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl (%edx), %eax ; X86-NEXT: movzwl %si, %esi ; X86-NEXT: .p2align 4 @@ -2288,10 +2288,10 @@ define zeroext i16 @atomic_shl1_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %edx ; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi ; X86-NEXT: movzwl (%esi), %eax @@ -2603,10 +2603,10 @@ define zeroext i16 @atomic_shl1_and_16_gpr_valnz(ptr %v, i16 zeroext %c) nounwin ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %edx ; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi ; X86-NEXT: movzwl (%esi), %eax @@ -2873,27 +2873,27 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_valnz(ptr %v, i16 zeroext %c) ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $15, %cl -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi -; X86-NEXT: 
movzwl (%edx), %eax +; X86-NEXT: movzwl (%esi), %eax ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB46_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl %edi, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) +; X86-NEXT: lock cmpxchgw %cx, (%esi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB46_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzwl %ax, %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testl %ecx, %esi +; X86-NEXT: testl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -3017,28 +3017,28 @@ define zeroext i16 @atomic_shl1_and_16_gpr_brnz(ptr %v, i16 zeroext %c) nounwind ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl $-2, %edi ; X86-NEXT: roll %cl, %edi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movzwl (%esi), %eax ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB48_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: andl %edi, %ebx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %bx, (%edx) +; X86-NEXT: lock cmpxchgw %bx, (%esi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB48_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end ; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: testl %eax, %esi +; X86-NEXT: testl %eax, %edx ; X86-NEXT: je .LBB48_3 ; X86-NEXT: # %bb.4: # %if.then ; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl (%edx,%eax,2), %eax +; X86-NEXT: movzwl (%esi,%eax,2), %eax ; X86-NEXT: jmp .LBB48_5 ; X86-NEXT: .LBB48_3: ; X86-NEXT: movw $123, %ax @@ -3361,22 +3361,22 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: andb $15, %cl ; X86-NEXT: movl $1, %esi ; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl $-2, %edi -; X86-NEXT: roll %cl, %edi -; X86-NEXT: movzwl (%edx), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl $-2, %ebx +; X86-NEXT: roll %cl, %ebx +; X86-NEXT: movzwl (%edi), %eax ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB52_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl %edi, %ecx +; X86-NEXT: andl %ebx, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: lock cmpxchgw %cx, (%edx) +; X86-NEXT: lock cmpxchgw %cx, (%edi) ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne .LBB52_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end @@ -3384,8 +3384,8 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X86-NEXT: testl %eax, %esi ; X86-NEXT: je .LBB52_3 ; X86-NEXT: # %bb.4: # %if.then -; X86-NEXT: movzwl %bx, %eax -; X86-NEXT: movzwl (%edx,%eax,2), %eax +; X86-NEXT: movzwl %dx, %eax +; X86-NEXT: movzwl (%edi,%eax,2), %eax ; X86-NEXT: jmp .LBB52_5 ; X86-NEXT: .LBB52_3: ; X86-NEXT: movw $123, %ax diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 0de308a9e0738..aec2b848e3a55 100644 --- 
a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -509,21 +509,21 @@ define void @avg_v32i16(ptr %a, ptr %b) nounwind { define void @avg_v40i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v40i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 64(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-NEXT: movdqa 48(%rdi), %xmm4 -; SSE2-NEXT: pavgw (%rsi), %xmm1 -; SSE2-NEXT: pavgw 16(%rsi), %xmm2 -; SSE2-NEXT: pavgw 32(%rsi), %xmm3 -; SSE2-NEXT: pavgw 48(%rsi), %xmm4 -; SSE2-NEXT: pavgw 64(%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 64(%rdi), %xmm4 +; SSE2-NEXT: pavgw 64(%rsi), %xmm4 ; SSE2-NEXT: movdqu %xmm4, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v40i16: @@ -1191,7 +1191,7 @@ define void @avg_v16i16_const(ptr %a) nounwind { ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) @@ -1241,7 +1241,7 @@ define void @avg_v32i16_const(ptr %a) nounwind { ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 @@ -1468,17 +1468,17 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi) ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm8 ; SSE2-NEXT: movdqa %xmm8, 128(%rdi) -; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi) ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm6 ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: movdqa %xmm5, 80(%rdi) +; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: movdqa %xmm4, 64(%rdi) -; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: movdqa %xmm3, 48(%rdi) ; SSE2-NEXT: pavgb {{[0-9]+}}(%rsp), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdi) @@ -1546,11 +1546,11 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 ; AVX1-NEXT: vpavgb 512(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vmovdqa %xmm8, 240(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-NEXT: vpavgb 480(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm7 ; AVX1-NEXT: vmovdqa %xmm7, 224(%rdi) -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vpavgb 480(%rbp), %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa %xmm7, 208(%rdi) +; AVX1-NEXT: vmovdqa %xmm8, 208(%rdi) ; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm6 ; AVX1-NEXT: vmovdqa %xmm6, 192(%rdi) ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 @@ -1566,11 +1566,11 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x 
i8> %b) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vpavgb 384(%rbp), %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 112(%rdi) +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpavgb 352(%rbp), %xmm4, %xmm4 ; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 96(%rdi) -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpavgb 352(%rbp), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, 80(%rdi) +; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi) ; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -1595,14 +1595,13 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: vmovdqa 240(%rbp), %ymm8 -; AVX2-NEXT: vmovdqa 208(%rbp), %ymm9 -; AVX2-NEXT: vmovdqa 176(%rbp), %ymm10 -; AVX2-NEXT: vmovdqa 144(%rbp), %ymm11 -; AVX2-NEXT: vmovdqa 112(%rbp), %ymm12 -; AVX2-NEXT: vmovdqa 80(%rbp), %ymm13 -; AVX2-NEXT: vmovdqa 48(%rbp), %ymm14 -; AVX2-NEXT: vmovdqa 16(%rbp), %ymm15 +; AVX2-NEXT: vmovdqa 208(%rbp), %ymm8 +; AVX2-NEXT: vmovdqa 176(%rbp), %ymm9 +; AVX2-NEXT: vmovdqa 144(%rbp), %ymm10 +; AVX2-NEXT: vmovdqa 112(%rbp), %ymm11 +; AVX2-NEXT: vmovdqa 80(%rbp), %ymm12 +; AVX2-NEXT: vmovdqa 48(%rbp), %ymm13 +; AVX2-NEXT: vmovdqa 16(%rbp), %ymm14 ; AVX2-NEXT: vpavgb 272(%rbp), %ymm0, %ymm0 ; AVX2-NEXT: vpavgb 304(%rbp), %ymm1, %ymm1 ; AVX2-NEXT: vpavgb 336(%rbp), %ymm2, %ymm2 @@ -1611,22 +1610,23 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX2-NEXT: vpavgb 432(%rbp), %ymm5, %ymm5 ; AVX2-NEXT: vpavgb 464(%rbp), %ymm6, %ymm6 ; AVX2-NEXT: vpavgb 496(%rbp), %ymm7, %ymm7 -; AVX2-NEXT: vpavgb 528(%rbp), %ymm15, %ymm15 -; AVX2-NEXT: vpavgb 560(%rbp), %ymm14, %ymm14 -; AVX2-NEXT: vpavgb 592(%rbp), %ymm13, %ymm13 -; AVX2-NEXT: vpavgb 624(%rbp), %ymm12, %ymm12 -; AVX2-NEXT: vpavgb 656(%rbp), %ymm11, %ymm11 -; AVX2-NEXT: vpavgb 688(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vpavgb 720(%rbp), %ymm9, %ymm9 -; AVX2-NEXT: vpavgb 752(%rbp), %ymm8, %ymm8 -; AVX2-NEXT: vmovdqa %ymm8, 480(%rdi) -; AVX2-NEXT: vmovdqa %ymm9, 448(%rdi) -; AVX2-NEXT: vmovdqa %ymm10, 416(%rdi) -; AVX2-NEXT: vmovdqa %ymm11, 384(%rdi) -; AVX2-NEXT: vmovdqa %ymm12, 352(%rdi) -; AVX2-NEXT: vmovdqa %ymm13, 320(%rdi) -; AVX2-NEXT: vmovdqa %ymm14, 288(%rdi) -; AVX2-NEXT: vmovdqa %ymm15, 256(%rdi) +; AVX2-NEXT: vpavgb 528(%rbp), %ymm14, %ymm14 +; AVX2-NEXT: vpavgb 560(%rbp), %ymm13, %ymm13 +; AVX2-NEXT: vpavgb 592(%rbp), %ymm12, %ymm12 +; AVX2-NEXT: vpavgb 624(%rbp), %ymm11, %ymm11 +; AVX2-NEXT: vpavgb 656(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vpavgb 688(%rbp), %ymm9, %ymm9 +; AVX2-NEXT: vpavgb 720(%rbp), %ymm8, %ymm8 +; AVX2-NEXT: vmovdqa 240(%rbp), %ymm15 +; AVX2-NEXT: vpavgb 752(%rbp), %ymm15, %ymm15 +; AVX2-NEXT: vmovdqa %ymm15, 480(%rdi) +; AVX2-NEXT: vmovdqa %ymm8, 448(%rdi) +; AVX2-NEXT: vmovdqa %ymm9, 416(%rdi) +; AVX2-NEXT: vmovdqa %ymm10, 384(%rdi) +; AVX2-NEXT: vmovdqa %ymm11, 352(%rdi) +; AVX2-NEXT: vmovdqa %ymm12, 320(%rdi) +; AVX2-NEXT: vmovdqa %ymm13, 288(%rdi) +; AVX2-NEXT: vmovdqa %ymm14, 256(%rdi) ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdi) ; AVX2-NEXT: vmovdqa %ymm6, 192(%rdi) ; AVX2-NEXT: vmovdqa %ymm5, 160(%rdi) @@ -1646,7 +1646,6 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp ; AVX512F-NEXT: subq $64, %rsp -; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vpavgb 16(%rbp), %ymm0, %ymm8 ; 
AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpavgb 48(%rbp), %ymm0, %ymm0 @@ -1665,6 +1664,7 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX512F-NEXT: vpavgb 336(%rbp), %ymm5, %ymm13 ; AVX512F-NEXT: vextracti64x4 $1, %zmm5, %ymm5 ; AVX512F-NEXT: vpavgb 368(%rbp), %ymm5, %ymm5 +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vpavgb 400(%rbp), %ymm6, %ymm14 ; AVX512F-NEXT: vextracti64x4 $1, %zmm6, %ymm6 ; AVX512F-NEXT: vpavgb 432(%rbp), %ymm6, %ymm6 @@ -1698,7 +1698,6 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX512BW-NEXT: movq %rsp, %rbp ; AVX512BW-NEXT: andq $-64, %rsp ; AVX512BW-NEXT: subq $64, %rsp -; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: vpavgb 16(%rbp), %zmm0, %zmm0 ; AVX512BW-NEXT: vpavgb 80(%rbp), %zmm1, %zmm1 ; AVX512BW-NEXT: vpavgb 144(%rbp), %zmm2, %zmm2 @@ -1706,6 +1705,7 @@ define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind { ; AVX512BW-NEXT: vpavgb 272(%rbp), %zmm4, %zmm4 ; AVX512BW-NEXT: vpavgb 336(%rbp), %zmm5, %zmm5 ; AVX512BW-NEXT: vpavgb 400(%rbp), %zmm6, %zmm6 +; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: vpavgb 464(%rbp), %zmm7, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdi) @@ -1975,7 +1975,8 @@ define <1 x i8> @avg_v1i8(<1 x i8> %x, <1 x i8> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl %sil, %eax ; CHECK-NEXT: movzbl %dil, %ecx -; CHECK-NEXT: leal 1(%rcx,%rax), %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: shrl %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avgceils-scalar.ll b/llvm/test/CodeGen/X86/avgceils-scalar.ll index 91121bd4ad935..a3c35dfd9cda5 100644 --- a/llvm/test/CodeGen/X86/avgceils-scalar.ll +++ b/llvm/test/CodeGen/X86/avgceils-scalar.ll @@ -13,7 +13,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -22,7 +23,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movsbl %sil, %eax ; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -38,7 +40,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -47,7 +50,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movsbl %sil, %eax ; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -65,7 +69,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: 
retl @@ -74,7 +79,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -90,7 +96,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl @@ -99,7 +106,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -128,7 +136,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movslq %esi, %rax ; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: leaq 1(%rcx,%rax), %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: incq %rax ; X64-NEXT: shrq %rax ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq @@ -155,7 +164,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movslq %esi, %rax ; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: leaq 1(%rcx,%rax), %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: incq %rax ; X64-NEXT: shrq %rax ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll index 70b2d4be2fd81..640091cef5899 100644 --- a/llvm/test/CodeGen/X86/avgceils.ll +++ b/llvm/test/CodeGen/X86/avgceils.ll @@ -30,7 +30,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; ; AVX2-LABEL: test_fixed_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 @@ -73,7 +73,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; ; AVX2-LABEL: test_ext_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 @@ -343,7 +343,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { ; ; AVX1-LABEL: test_fixed_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 @@ -394,7 +394,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { ; ; AVX1-LABEL: 
test_ext_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 @@ -841,7 +841,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; ; AVX1-LABEL: test_fixed_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 @@ -912,7 +912,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; ; AVX1-LABEL: test_ext_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avgceilu-scalar.ll b/llvm/test/CodeGen/X86/avgceilu-scalar.ll index 4ab4851eccd2c..96efe6b4d7c8e 100644 --- a/llvm/test/CodeGen/X86/avgceilu-scalar.ll +++ b/llvm/test/CodeGen/X86/avgceilu-scalar.ll @@ -13,7 +13,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -22,7 +23,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -38,7 +40,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl @@ -47,7 +50,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -65,7 +69,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax 
; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl @@ -74,7 +79,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -90,7 +96,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 1(%ecx,%eax), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: incl %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl @@ -99,7 +106,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq @@ -128,7 +136,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl %edi, %ecx -; X64-NEXT: leaq 1(%rcx,%rax), %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: incq %rax ; X64-NEXT: shrq %rax ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq @@ -155,7 +164,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %eax ; X64-NEXT: movl %edi, %ecx -; X64-NEXT: leaq 1(%rcx,%rax), %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: incq %rax ; X64-NEXT: shrq %rax ; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avgfloors-scalar.ll b/llvm/test/CodeGen/X86/avgfloors-scalar.ll index fd303192e6c50..1b9de676f28cf 100644 --- a/llvm/test/CodeGen/X86/avgfloors-scalar.ll +++ b/llvm/test/CodeGen/X86/avgfloors-scalar.ll @@ -320,11 +320,11 @@ define i64 @test_lsb_i64(i64 %a0, i64 %a1) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sarl %ebx ; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: sarl %ebx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl %edx ; X86-NEXT: shldl $31, %esi, %ecx +; X86-NEXT: sarl %edx ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: andl %esi, %eax diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll index 0508e5ccb5430..b682df1b1d5de 100644 --- a/llvm/test/CodeGen/X86/avgfloors.ll +++ b/llvm/test/CodeGen/X86/avgfloors.ll @@ -41,7 +41,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -96,7 +96,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index 11e886e25ba4e..256b06ef85942 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -118,23 +118,23 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %r9, %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: setb %dil -; CHECK-NEXT: movzbl %dil, %edi -; CHECK-NEXT: shldq $63, %rdx, %rdi +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %r9d +; CHECK-NEXT: shldq $63, %rdx, %r9 ; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: shldq $63, %r8, %r9 +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %r10d +; CHECK-NEXT: shldq $63, %r8, %r10 ; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shldq $63, %rcx, %r8 -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %r9, 24(%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %rdx, (%rdi) +; CHECK-NEXT: movq %r10, 24(%rdi) +; CHECK-NEXT: movq %r9, 8(%rdi) ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x diff --git a/llvm/test/CodeGen/X86/avgflooru-scalar.ll b/llvm/test/CodeGen/X86/avgflooru-scalar.ll index 9ae4492bb4cd4..42d9edf4a564b 100644 --- a/llvm/test/CodeGen/X86/avgflooru-scalar.ll +++ b/llvm/test/CodeGen/X86/avgflooru-scalar.ll @@ -273,8 +273,8 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: test_fixed_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setb %dl ; X86-NEXT: movzbl %dl, %edx @@ -308,11 +308,11 @@ define i64 @test_lsb_i64(i64 %a0, i64 %a1) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shrl %ebx ; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: shrl %ebx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl %edx ; X86-NEXT: shldl $31, %esi, %ecx +; X86-NEXT: shrl %edx ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: andl %esi, %eax @@ -348,8 +348,8 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: test_ext_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setb %dl ; X86-NEXT: movzbl %dl, %edx diff --git a/llvm/test/CodeGen/X86/avoid-lea-scale2.ll b/llvm/test/CodeGen/X86/avoid-lea-scale2.ll index cee2ee4e03992..af171410b1242 100644 --- a/llvm/test/CodeGen/X86/avoid-lea-scale2.ll +++ b/llvm/test/CodeGen/X86/avoid-lea-scale2.ll @@ -1,8 +1,19 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -; CHECK: leal -2({{%rdi,%rdi|%rcx,%rcx}}) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s 
--check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=DARWIN define i32 @foo(i32 %x) nounwind readnone { +; LINUX-LABEL: foo: +; LINUX: # %bb.0: +; LINUX-NEXT: # kill: def $edi killed $edi def $rdi +; LINUX-NEXT: leal -2(,%rdi,2), %eax +; LINUX-NEXT: retq +; +; DARWIN-LABEL: foo: +; DARWIN: # %bb.0: +; DARWIN-NEXT: # kill: def $ecx killed $ecx def $rcx +; DARWIN-NEXT: leal -2(,%rcx,2), %eax +; DARWIN-NEXT: retq %t0 = shl i32 %x, 1 %t1 = add i32 %t0, -2 ret i32 %t1 diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll index b47f424acc942..d37d290e55a25 100644 --- a/llvm/test/CodeGen/X86/avx-basic.ll +++ b/llvm/test/CodeGen/X86/avx-basic.ll @@ -87,7 +87,7 @@ define <8 x i32> @VMOVZQI2PQI(ptr nocapture %aFOO) nounwind { define <16 x float> @fneg(<16 x float> %a) nounwind { ; CHECK-LABEL: fneg: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-cvt-3.ll b/llvm/test/CodeGen/X86/avx-cvt-3.ll index 760db4af1f1b4..fc3e6b7af5a33 100644 --- a/llvm/test/CodeGen/X86/avx-cvt-3.ll +++ b/llvm/test/CodeGen/X86/avx-cvt-3.ll @@ -91,13 +91,13 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86: # %bb.0: ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: movl $2, %eax ; X86-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; X86-NEXT: movl $-3, %eax ; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl @@ -106,13 +106,13 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X64: # %bb.0: ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-NEXT: movl $2, %eax ; X64-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; X64-NEXT: movl $-3, %eax ; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 7d0a5679936da..b9affa01414b2 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1486,13 +1486,13 @@ define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 ; X64-LABEL: test_mm256_set_epi8: ; X64: # %bb.0: ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: vmovd %eax, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: vpinsrb $1, %r10d, %xmm0, 
%xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; X64-NEXT: vpinsrb $3, %r10d, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax @@ -2095,28 +2095,28 @@ define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i ; X64-LABEL: test_mm256_setr_epi8: ; X64: # %bb.0: ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vmovd %r10d, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 ; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $6, %r10d, %xmm0, %xmm0 ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $9, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrb $10, %r10d, %xmm0, %xmm0 ; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 @@ -2231,12 +2231,12 @@ define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 ; X64-LABEL: test_mm256_setr_epi16: ; X64: # %bb.0: ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vmovd %r10d, %xmm0 +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm0 +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0 ; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll index ea010ebc28a43..eaea5a0c46145 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll @@ -5,19 +5,23 @@ define <4 x double> 
@test_x86_avx_vzeroall(<4 x double> %a, <4 x double> %b) { ; AVX-LABEL: test_x86_avx_vzeroall: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xd8] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: # encoding: [0xc5,0xfc,0x11,0x4c,0x24,0xd8] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: # encoding: [0xc5,0xfc,0x11,0x44,0x24,0xb8] ; AVX-NEXT: vzeroall # encoding: [0xc5,0xfc,0x77] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xd8] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # encoding: [0xc5,0xfd,0x10,0x44,0x24,0xb8] +; AVX-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # encoding: [0xc5,0xfd,0x58,0x44,0x24,0xd8] ; AVX-NEXT: retq # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vzeroall: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16 # encoding: [0x62,0xe1,0xfd,0x28,0x58,0xc1] +; AVX512VL-NEXT: vmovapd %ymm1, %ymm16 # encoding: [0x62,0xe1,0xfd,0x28,0x28,0xc1] +; AVX512VL-NEXT: vmovapd %ymm0, %ymm17 # encoding: [0x62,0xe1,0xfd,0x28,0x28,0xc8] ; AVX512VL-NEXT: vzeroall # encoding: [0xc5,0xfc,0x77] -; AVX512VL-NEXT: vmovapd %ymm16, %ymm0 # encoding: [0x62,0xb1,0xfd,0x28,0x28,0xc0] +; AVX512VL-NEXT: vaddpd %ymm16, %ymm17, %ymm0 # encoding: [0x62,0xb1,0xf5,0x20,0x58,0xc0] ; AVX512VL-NEXT: retq # encoding: [0xc3] %c = fadd <4 x double> %a, %b call void @llvm.x86.avx.vzeroall() @@ -28,19 +32,23 @@ declare void @llvm.x86.avx.vzeroall() nounwind define <4 x double> @test_x86_avx_vzeroupper(<4 x double> %a, <4 x double> %b) { ; AVX-LABEL: test_x86_avx_vzeroupper: ; AVX: # %bb.0: -; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xd8] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: # encoding: [0xc5,0xfc,0x11,0x4c,0x24,0xd8] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: # encoding: [0xc5,0xfc,0x11,0x44,0x24,0xb8] ; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xd8] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # encoding: [0xc5,0xfd,0x10,0x44,0x24,0xb8] +; AVX-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # encoding: [0xc5,0xfd,0x58,0x44,0x24,0xd8] ; AVX-NEXT: retq # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vzeroupper: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16 # encoding: [0x62,0xe1,0xfd,0x28,0x58,0xc1] +; AVX512VL-NEXT: vmovapd %ymm1, %ymm16 # encoding: [0x62,0xe1,0xfd,0x28,0x28,0xc1] +; AVX512VL-NEXT: vmovapd %ymm0, %ymm17 # encoding: [0x62,0xe1,0xfd,0x28,0x28,0xc8] ; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; AVX512VL-NEXT: vmovapd %ymm16, %ymm0 # encoding: [0x62,0xb1,0xfd,0x28,0x28,0xc0] +; AVX512VL-NEXT: vaddpd %ymm16, %ymm17, %ymm0 # encoding: [0x62,0xb1,0xf5,0x20,0x58,0xc0] ; AVX512VL-NEXT: retq # encoding: [0xc3] %c = fadd <4 x double> %a, %b call void @llvm.x86.avx.vzeroupper() diff --git a/llvm/test/CodeGen/X86/avx-logic.ll 
b/llvm/test/CodeGen/X86/avx-logic.ll index 3b14e5a20b2f5..50a3b03c96979 100644 --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -461,9 +461,9 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) ; ; INT256-LABEL: or_disguised_i8_elts: ; INT256: # %bb.0: +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; INT256-NEXT: vpor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpor %ymm3, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %a = add <8 x i32> %x, %y @@ -490,9 +490,9 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; ; INT256-LABEL: xor_disguised_i8_elts: ; INT256: # %bb.0: +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %a = add <8 x i32> %x, %y @@ -548,9 +548,9 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; ; INT256-LABEL: or_disguised_i16_elts: ; INT256: # %bb.0: +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; INT256-NEXT: vpor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpor %ymm3, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %a = add <8 x i32> %x, %y @@ -577,9 +577,9 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; INT256-LABEL: xor_disguised_i16_elts: ; INT256: # %bb.0: +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %a = add <8 x i32> %x, %y diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll index 1b688c8cf9cca..4c8d3a76cb0ce 100644 --- a/llvm/test/CodeGen/X86/avx-select.ll +++ b/llvm/test/CodeGen/X86/avx-select.ll @@ -16,8 +16,8 @@ define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind { ; ; X64-LABEL: select00: ; X64: # %bb.0: -; X64-NEXT: cmpl $255, %edi ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: cmpl $255, %edi ; X64-NEXT: je .LBB0_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovaps %ymm0, %ymm1 @@ -44,8 +44,8 @@ define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind { ; ; X64-LABEL: select01: ; X64: # %bb.0: -; X64-NEXT: cmpl $255, %edi ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: cmpl $255, %edi ; X64-NEXT: je .LBB1_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovaps %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll index 15c2aab1a82e1..9da732b9f6963 100644 --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -71,8 +71,8 @@ define <8 x float> @funcE() nounwind { ; X86-LABEL: funcE: ; X86: # %bb.0: # %allocas ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testb %al, %al ; X86-NEXT: # implicit-def: $ymm0 
+; X86-NEXT: testb %al, %al ; X86-NEXT: jne .LBB4_2 ; X86-NEXT: # %bb.1: # %load.i1247 ; X86-NEXT: pushl %ebp @@ -88,8 +88,8 @@ define <8 x float> @funcE() nounwind { ; X64-LABEL: funcE: ; X64: # %bb.0: # %allocas ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testb %al, %al ; X64-NEXT: # implicit-def: $ymm0 +; X64-NEXT: testb %al, %al ; X64-NEXT: jne .LBB4_2 ; X64-NEXT: # %bb.1: # %load.i1247 ; X64-NEXT: pushq %rbp diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index 0bfd8921e8b42..5a438eee62e49 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -25,17 +25,13 @@ entry: define <4 x i64> @A2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp { ; X86-LABEL: A2: ; X86: ## %bb.0: ## %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-NEXT: movl 4(%ecx), %ecx ; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: popl %esi +; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: retl ; ; X64-LABEL: A2: diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll index 60fab8bc67379..d028e7475c067 100644 --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -651,7 +651,7 @@ entry: define <8 x i32> @ld0_hi0_lo1_8i32(ptr %pa, <8 x i32> %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: ld0_hi0_lo1_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -672,7 +672,7 @@ entry: define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, ptr %pb) nounwind uwtable readnone ssp { ; AVX1-LABEL: ld1_hi0_hi1_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4] ; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 70b3b99b46ce9..d3e126e97a928 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -232,7 +232,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) { define <8 x i32> @mul_const9(<8 x i32> %x) { ; CHECK-LABEL: mul_const9: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,0] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %y = mul <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll index 7b35e602cc0fa..46d6e3d66a328 100644 --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -16,8 +16,7 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; ; X86-FAST-ALL-LABEL: trunc4: ; X86-FAST-ALL: # %bb.0: -; X86-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; X86-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; X86-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; X86-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X86-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X86-FAST-ALL-NEXT: vzeroupper @@ -39,8 +38,7 @@ define <4 x 
i32> @trunc4(<4 x i64> %A) nounwind { ; ; X64-FAST-ALL-LABEL: trunc4: ; X64-FAST-ALL: # %bb.0: -; X64-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; X64-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X64-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index c5243a5c18a2d..22a82433c487e 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -72,34 +72,30 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_packsswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; X86-AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; 
X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) ret <32 x i8> %res @@ -125,34 +121,30 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_packuswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; X86-AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, 
value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) ret <32 x i8> %res @@ -1062,56 +1054,56 @@ define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psllv_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: 
vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] ; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1140,14 +1132,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1155,14 +1147,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1170,14 +1162,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 
5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1185,14 +1177,14 @@ define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1221,36 +1213,36 @@ define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) { define <2 x i64> @test_x86_avx2_psllv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4294967295,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4294967295,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,18446744073709551615] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [4,18446744073709551615] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1277,36 +1269,36 @@ define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) { define <4 x i64> @test_x86_avx2_psllv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psllv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4294967295,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 
5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4294967295,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,18446744073709551615] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1333,14 +1325,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: 
FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1348,14 +1340,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1363,14 +1355,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] @@ 
-1378,14 +1370,14 @@ define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] @@ -1415,14 +1407,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1430,14 +1422,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX512VL: # %bb.0: -; 
X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1445,14 +1437,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1460,14 +1452,14 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] 
-; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] @@ -1497,27 +1489,27 @@ define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) { define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,4] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4] +; X64-AVX-NEXT: # 
encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] @@ -1554,8 +1546,8 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) { define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] +; X86-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -1563,9 +1555,9 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,4,4,4] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] @@ -1610,36 +1602,36 @@ define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psrav_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: 
vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,9,4294967284,23] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] @@ -1665,36 +1657,36 @@ define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psrav_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsravd 
{{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-AVX512VL-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx2-nontemporal.ll b/llvm/test/CodeGen/X86/avx2-nontemporal.ll index cd16b30184482..b8de6f21bef70 100644 --- a/llvm/test/CodeGen/X86/avx2-nontemporal.ll +++ b/llvm/test/CodeGen/X86/avx2-nontemporal.ll @@ -9,9 +9,7 @@ define i32 @f(<8 x float> %A, ptr %B, <4 x double> %C, <4 x i64> %E, <8 x i32> % ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-32, %esp ; X86-NEXT: subl $32, %esp -; X86-NEXT: vmovdqa 104(%ebp), %ymm3 -; X86-NEXT: vmovdqa 72(%ebp), %ymm4 -; X86-NEXT: vmovdqa 40(%ebp), %ymm5 +; X86-NEXT: vmovdqa 72(%ebp), %ymm3 ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movl 136(%ebp), %edx ; X86-NEXT: movl (%edx), %eax @@ -19,17 +17,19 @@ define i32 @f(<8 x float> %A, ptr %B, <4 x double> %C, <4 x i64> %E, <8 x i32> % ; X86-NEXT: vmovntps %ymm0, (%ecx) ; X86-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm0 ; X86-NEXT: addl (%edx), %eax +; X86-NEXT: vmovdqa 40(%ebp), %ymm2 ; X86-NEXT: vmovntdq %ymm0, (%ecx) ; X86-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0 ; X86-NEXT: addl (%edx), %eax +; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm1 ; X86-NEXT: vmovntpd %ymm0, (%ecx) -; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm5, %ymm0 ; X86-NEXT: addl (%edx), %eax -; X86-NEXT: vmovntdq %ymm0, (%ecx) -; X86-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm4, %ymm0 +; X86-NEXT: vmovntdq %ymm1, (%ecx) +; X86-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm0 ; X86-NEXT: addl (%edx), %eax +; X86-NEXT: vmovdqa 104(%ebp), %ymm1 ; X86-NEXT: vmovntdq %ymm0, (%ecx) 
-; X86-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm0 +; X86-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0 ; X86-NEXT: addl (%edx), %eax ; X86-NEXT: vmovntdq %ymm0, (%ecx) ; X86-NEXT: movl %ebp, %esp @@ -47,10 +47,10 @@ define i32 @f(<8 x float> %A, ptr %B, <4 x double> %C, <4 x i64> %E, <8 x i32> % ; X64-NEXT: vmovntdq %ymm0, (%rdi) ; X64-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; X64-NEXT: addl (%rsi), %eax +; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 ; X64-NEXT: vmovntpd %ymm0, (%rdi) -; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 ; X64-NEXT: addl (%rsi), %eax -; X64-NEXT: vmovntdq %ymm0, (%rdi) +; X64-NEXT: vmovntdq %ymm1, (%rdi) ; X64-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0 ; X64-NEXT: addl (%rsi), %eax ; X64-NEXT: vmovntdq %ymm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index c50af6968f5bb..5f1e76808b44b 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -655,15 +655,25 @@ define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp { } define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp { -; X86-LABEL: _e4: -; X86: ## %bb.0: -; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] -; X86-NEXT: retl +; X86-AVX2-LABEL: _e4: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] +; X86-AVX2-NEXT: retl ; -; X64-LABEL: _e4: -; X64: ## %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] -; X64-NEXT: retq +; X64-AVX2-LABEL: _e4: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] +; X64-AVX2-NEXT: retq +; +; X86-AVX512VL-LABEL: _e4: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u] +; X86-AVX512VL-NEXT: retl +; +; X64-AVX512VL-LABEL: _e4: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u] +; X64-AVX512VL-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2 diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll index 983c69d1a1c2e..2dd0244d4cd75 100644 --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -60,14 +60,14 @@ define <8 x i32> @test_vpslld_var(i32 %shift) { ; X86-LABEL: test_vpslld_var: ; X86: # %bb.0: ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] ; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_vpslld_var: ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199] ; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq %amt = insertelement <8 x i32> undef, i32 %shift, i32 0 @@ -267,19 +267,19 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X86-SLOW-LABEL: srl_trunc_and_v4i64: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-SLOW-NEXT: 
vbroadcastss {{.*#+}} xmm3 = [8,8,8,8] ; X86-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X86-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X86-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-SLOW-NEXT: vandps %xmm3, %xmm1, %xmm1 ; X86-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-SLOW-NEXT: vzeroupper ; X86-SLOW-NEXT: retl ; ; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X86-FAST-ALL: # %bb.0: -; X86-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] +; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,u,u,u,u] +; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,8,8,8] ; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-FAST-ALL-NEXT: vpand %xmm3, %xmm1, %xmm1 ; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-FAST-ALL-NEXT: vzeroupper ; X86-FAST-ALL-NEXT: retl @@ -287,9 +287,9 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64: ; X86-FAST-PERLANE: # %bb.0: ; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8] ; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-FAST-PERLANE-NEXT: vandps %xmm3, %xmm1, %xmm1 ; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-FAST-PERLANE-NEXT: vzeroupper ; X86-FAST-PERLANE-NEXT: retl @@ -297,19 +297,19 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64-SLOW-LABEL: srl_trunc_and_v4i64: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8] ; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-SLOW-NEXT: vandps %xmm3, %xmm1, %xmm1 ; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-SLOW-NEXT: vzeroupper ; X64-SLOW-NEXT: retq ; ; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X64-FAST-ALL: # %bb.0: -; X64-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] +; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,u,u,u,u] +; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,8,8,8] ; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-FAST-ALL-NEXT: vpand %xmm3, %xmm1, %xmm1 ; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-FAST-ALL-NEXT: vzeroupper ; X64-FAST-ALL-NEXT: retq @@ -317,9 +317,9 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64: ; X64-FAST-PERLANE: # %bb.0: ; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8] ; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-FAST-PERLANE-NEXT: vandps %xmm3, %xmm1, %xmm1 ; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-FAST-PERLANE-NEXT: vzeroupper ; X64-FAST-PERLANE-NEXT: retq @@ -446,7 +446,6 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { ; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4 ; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpblendvb %ymm2, 
%ymm4, %ymm3, %ymm2 -; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -457,8 +456,9 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { ; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3 ; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm1 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0 -; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %ashr = ashr <32 x i8> %r, %a ret <32 x i8> %ashr diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 20550fc4eb9fa..25ae50ae6e1ac 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -293,7 +293,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512F-LABEL: imulq128_bcast: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -313,7 +313,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; ; AVX512BW-LABEL: imulq128_bcast: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -324,7 +324,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512DQ-LABEL: imulq128_bcast: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index ba2cacc087b36..c39e317245f86 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -7,14 +7,15 @@ define void @bcast_unfold_add_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpaddd (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -39,14 +40,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_add_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} 
ymm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpaddd (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -71,14 +73,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_add_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddd 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpaddd (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB2_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -102,14 +105,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_add_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpaddq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB3_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -134,14 +138,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_add_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpaddq (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB4_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -166,14 +171,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_add_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_add_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpaddq 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpaddq (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB5_1 ; CHECK-NEXT: # 
%bb.2: # %bb10 ; CHECK-NEXT: retq @@ -197,15 +203,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB6_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -230,15 +237,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm0 ; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm1 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm0, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB7_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -263,15 +271,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB8_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm0, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB8_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -295,15 +304,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB9_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB9_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -328,15 +338,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB10_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: 
vmovdqu 8192(%rdi,%rax), %ymm0 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm0 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm0, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB10_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -361,15 +372,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_mul_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_mul_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB11_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm0 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm0, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB11_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -393,14 +405,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB12_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpord 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpord (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB12_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -425,14 +438,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB13_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vorps (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB13_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -457,14 +471,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB14_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vorps (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB14_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq 
@@ -488,14 +503,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB15_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vporq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vporq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB15_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -520,14 +536,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB16_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vorps (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB16_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -552,15 +569,16 @@ bb10: ; preds = %bb2 define void @bcast_unfold_or_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_or_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3,3] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vorps (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB17_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -584,14 +602,15 @@ bb10: ; preds = %bb2 define void @bcast_unfold_fneg_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB18_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpxord 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpxord (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB18_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -616,14 +635,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fneg_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB19_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vxorps (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB19_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -648,14 +668,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fneg_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB20_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vxorps (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB20_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -679,14 +700,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fneg_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpxorq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpxorq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB21_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -711,14 +733,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fneg_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vxorps (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB22_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -743,15 +766,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fneg_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fneg_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vxorps 
(%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB23_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -775,14 +799,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fabs_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB24_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpandd 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpandd (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB24_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -810,14 +835,15 @@ declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0 define void @bcast_unfold_fabs_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vandps (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB25_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -845,14 +871,15 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0 define void @bcast_unfold_fabs_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB26_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vandps (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB26_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -879,14 +906,15 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0 define void @bcast_unfold_fabs_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB27_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpandq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpandq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB27_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -914,14 +942,15 @@ 
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0 define void @bcast_unfold_fabs_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB28_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vandps (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB28_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -949,15 +978,16 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0 define void @bcast_unfold_fabs_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fabs_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [NaN,NaN] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB29_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vandps (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB29_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -984,14 +1014,15 @@ declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0 define void @bcast_unfold_fadd_v16f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB30_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vaddps (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB30_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1016,14 +1047,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fadd_v8f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB31_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vaddps (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB31_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1048,14 +1080,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fadd_v4f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v4f32: ; CHECK: # %bb.0: # %bb -; 
CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB32_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vaddps (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB32_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -1079,14 +1112,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fadd_v8f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB33_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vaddpd (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB33_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1111,14 +1145,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fadd_v4f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB34_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vaddpd (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB34_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1143,15 +1178,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fadd_v2f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fadd_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB35_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vaddpd (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB35_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -1175,14 +1211,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v16f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB36_1: # %bb1 ; CHECK-NEXT: # 
=>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmulps (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB36_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1207,14 +1244,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v8f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB37_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmulps (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB37_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1239,14 +1277,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v4f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB38_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmulps (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB38_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -1270,14 +1309,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v8f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB39_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmulpd (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB39_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1302,14 +1342,15 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v4f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB40_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmulpd (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; 
CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB40_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1334,15 +1375,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fmul_v2f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fmul_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB41_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmulpd (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB41_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -1366,15 +1408,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v16f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB42_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB42_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1399,15 +1442,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v8f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB43_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB43_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1432,15 +1476,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v4f32(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB44_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB44_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ 
-1464,15 +1509,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v8f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB45_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB45_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1497,15 +1543,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v4f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB46_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB46_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: vzeroupper @@ -1530,16 +1577,17 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fdiv_v2f64(ptr nocapture %arg) { ; CHECK-LABEL: bcast_unfold_fdiv_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB47_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vdivpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB47_1 ; CHECK-NEXT: # %bb.2: # %bb9 ; CHECK-NEXT: retq @@ -1563,15 +1611,16 @@ bb9: ; preds = %bb1 define void @bcast_unfold_fma213_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB48_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB48_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: retq @@ -1596,15 +1645,16 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: 
movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB49_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB49_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -1629,15 +1679,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fma213_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB50_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB50_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: vzeroupper @@ -1663,15 +1714,16 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB51_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -1697,15 +1749,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fma213_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB52_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB52_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: vzeroupper @@ -1731,15 +1784,16 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq 
$-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB53_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB53_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -1765,16 +1819,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fma213_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB54_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB54_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: retq @@ -1799,16 +1854,17 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB55_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB55_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -1833,15 +1889,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fma213_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB56_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB56_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: vzeroupper @@ -1867,15 +1924,16 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl 
%eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB57_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB57_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -1901,15 +1959,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fma213_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB58_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB58_1 ; CHECK-NEXT: # %bb.2: # %bb11 ; CHECK-NEXT: vzeroupper @@ -1935,15 +1994,16 @@ bb11: ; preds = %bb2 define void @bcast_unfold_fma231_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB59_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB59_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -1969,15 +2029,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB60_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB60_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2002,15 +2063,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: 
.p2align 4 ; CHECK-NEXT: .LBB61_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB61_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2036,15 +2098,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB62_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB62_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2070,16 +2133,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB63_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB63_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2104,15 +2168,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB64_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB64_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2138,15 +2203,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmax_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmax_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB65_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), 
%zmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB65_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2172,15 +2238,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB66_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB66_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2205,15 +2272,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB67_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB67_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2239,15 +2307,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB68_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB68_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2273,16 +2342,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB69_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 
8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB69_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2307,15 +2377,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v4f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB70_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB70_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2341,15 +2412,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_fmin_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_fmin_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB71_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB71_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2375,14 +2447,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smin_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB72_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpminsd (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB72_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2407,14 +2480,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smin_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB73_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpminsd (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB73_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2440,14 +2514,15 @@ bb10: ; preds = %bb1 define void 
@bcast_unfold_smin_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB74_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpminsd (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB74_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2473,14 +2548,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smin_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB75_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpminsq (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB75_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2505,14 +2581,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smin_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB76_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpminsq (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB76_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2538,14 +2615,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smin_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smin_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB77_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpminsq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB77_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2571,14 +2649,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB78_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: 
vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpmaxsd (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB78_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2603,14 +2682,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB79_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpmaxsd (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB79_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2636,14 +2716,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB80_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpmaxsd (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB80_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2669,14 +2750,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB81_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpmaxsq (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB81_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2701,14 +2783,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_smax_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB82_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpmaxsq (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB82_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2734,14 +2817,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_smax_v8i64(ptr %arg) { ; 
CHECK-LABEL: bcast_unfold_smax_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB83_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpmaxsq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB83_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2767,14 +2851,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB84_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpminud (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB84_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2799,14 +2884,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB85_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpminud (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB85_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2832,14 +2918,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB86_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpminud (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB86_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2865,14 +2952,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB87_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; 
CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpminuq (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB87_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2897,14 +2985,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB88_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpminuq (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB88_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2930,14 +3019,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umin_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umin_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB89_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpminuq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB89_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -2963,14 +3053,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB90_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpmaxud (%rdi,%rax,4), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB90_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -2995,14 +3086,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB91_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpmaxud (%rdi,%rax,4), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB91_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3028,14 +3120,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v16i32: ; CHECK: # %bb.0: 
# %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB92_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpmaxud (%rdi,%rax,4), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB92_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3061,14 +3154,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB93_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vpmaxuq (%rdi,%rax,8), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB93_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -3093,14 +3187,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB94_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vpmaxuq (%rdi,%rax,8), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB94_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3126,14 +3221,15 @@ bb10: ; preds = %bb1 define void @bcast_unfold_umax_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_umax_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB95_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vpmaxuq (%rdi,%rax,8), %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB95_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3159,16 +3255,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB96_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd 
{{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB96_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -3193,16 +3290,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB97_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB97_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3228,16 +3326,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB98_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB98_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3263,16 +3362,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB99_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB99_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -3297,16 +3397,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB100_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = 
[3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB100_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3332,16 +3433,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB101_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB101_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3367,16 +3469,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB102_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB102_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -3401,16 +3504,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB103_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 ; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB103_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3436,16 +3540,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB104_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 ; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = 
[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB104_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3471,16 +3576,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB105_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 ; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB105_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -3505,16 +3611,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB106_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB106_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -3540,16 +3647,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB107_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 ; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB107_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4003,17 +4111,18 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v4f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_cmp_v4f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB120_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %xmm2 ; CHECK-NEXT: vcmpltps %xmm0, %xmm2, 
%k1 ; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovups %xmm2, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB120_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -4038,17 +4147,18 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v8f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_cmp_v8f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB121_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %ymm2 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1 ; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm2, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB121_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4074,17 +4184,18 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v16f32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_cmp_v16f32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB122_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2 +; CHECK-NEXT: vmovups (%rdi,%rax,4), %zmm2 ; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1 ; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovups %zmm2, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB122_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4110,7 +4221,7 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v2f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_cmp_v2f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] ; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0] @@ -4118,11 +4229,12 @@ define void @bcast_unfold_cmp_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB123_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %xmm2 ; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1 ; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovupd %xmm2, (%rdi,%rax,8) +; CHECK-NEXT: addq $2, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB123_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -4147,17 +4259,18 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v4f64(ptr %arg) { ; 
CHECK-LABEL: bcast_unfold_cmp_v4f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB124_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %ymm2 ; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1 ; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovupd %ymm2, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB124_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4183,17 +4296,18 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v8f64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_cmp_v8f64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB125_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2 +; CHECK-NEXT: vmovupd (%rdi,%rax,8), %zmm2 ; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1 ; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovupd %zmm2, (%rdi,%rax,8) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB125_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4219,15 +4333,16 @@ bb10: ; preds = %bb1 define void @bcast_unfold_cmp_v8f32_refold(ptr nocapture %0) { ; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold: ; CHECK: # %bb.0: -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vcmpgtps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vcmpgtps (%rdi,%rax,4), %ymm0, %k1 ; CHECK-NEXT: vblendmps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovups %ymm2, (%rdi,%rax,4) +; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB126_1 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: vzeroupper @@ -4252,16 +4367,17 @@ define void @bcast_unfold_cmp_v8f32_refold(ptr nocapture %0) { define void @bcast_unfold_ptestm_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_ptestm_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB127_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, 
%rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB127_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -4287,16 +4403,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_ptestnm_v4i32(ptr %arg) { ; CHECK-LABEL: bcast_unfold_ptestnm_v4i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB128_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB128_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: retq @@ -4322,16 +4439,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_ptestm_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_ptestm_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB129_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB129_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4358,16 +4476,17 @@ bb10: ; preds = %bb1 define void @bcast_unfold_ptestnm_v4i64(ptr %arg) { ; CHECK-LABEL: bcast_unfold_ptestnm_v4i64: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB130_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) -; CHECK-NEXT: addq $32, %rax +; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: addq $4, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB130_1 ; CHECK-NEXT: # %bb.2: # %bb10 ; CHECK-NEXT: vzeroupper @@ -4394,18 +4513,19 @@ bb10: ; preds = %bb1 define void @bcast_unfold_vpternlog_v16i32(ptr %arg, ptr %arg1) { ; CHECK-LABEL: bcast_unfold_vpternlog_v16i32: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB131_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm2 +; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 +; CHECK-NEXT: vmovdqu64 (%rsi,%rax,4), %zmm2 ; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm3 ; 
CHECK-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1)) ; CHECK-NEXT: vpmulld %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) -; CHECK-NEXT: addq $64, %rax +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: addq $16, %rax +; CHECK-NEXT: cmpq $1024, %rax # imm = 0x400 ; CHECK-NEXT: jne .LBB131_1 ; CHECK-NEXT: # %bb.2: # %bb20 ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index a78d97782e6a3..3fb92e485bfc7 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -25,24 +25,24 @@ define <8 x double> @sltof864(<8 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -69,12 +69,12 @@ define <4 x double> @slto4f64(<4 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; @@ -150,11 +150,11 @@ define <4 x float> @slto4f32_mem(ptr %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 ; NODQ-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; 
NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32_mem: @@ -249,13 +249,13 @@ define <4 x float> @slto4f32(<4 x i64> %a) { ; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -284,13 +284,13 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { ; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -319,13 +319,13 @@ define <4 x float> @ulto4f32_nneg(<4 x i64> %a) { ; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -784,8 +784,8 @@ define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x doubl ; NOVL-LABEL: f32to4f64_mask: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 +; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1 ; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -831,8 +831,8 @@ define <4 x double> @f32to4f64_maskz_load(ptr %p, <4 x double> %b1, <4 x double> ; NOVL-LABEL: f32to4f64_maskz_load: ; NOVL: # %bb.0: ; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; NOVL-NEXT: vcvtps2pd (%rdi), %ymm2 +; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; NOVL-NEXT: vcmpltpd %zmm1, %zmm0, 
%k1 ; NOVL-NEXT: vmovapd %zmm2, %zmm0 {%k1} {z} ; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -993,26 +993,26 @@ define <8 x float> @slto8f32(<8 x i64> %a) { ; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vmovq %xmm0, %rcx +; NODQ-NEXT: vcvtsi2ss %rcx, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; @@ -1037,52 +1037,52 @@ define <16 x float> @slto16f32(<16 x i64> %a) { ; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NODQ-NEXT: vmovq %xmm4, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 +; NODQ-NEXT: vpextrq $1, %xmm4, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm7, %xmm7 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, 
%rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm8, %xmm8 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[2,3] +; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm4 +; NODQ-NEXT: vpextrq $1, %xmm4, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm5 +; NODQ-NEXT: vmovq %xmm4, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm4 +; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6 +; NODQ-NEXT: vmovq %xmm6, %rax +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm7 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0],xmm3[3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm6, %rax +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[2,3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm5 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm6 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[2,3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm6[0],xmm3[3] +; NODQ-NEXT: vcvtsi2ss %rax, %xmm9, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq @@ -1112,24 +1112,24 @@ define <8 x double> @slto8f64(<8 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: 
vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1162,44 +1162,44 @@ define <16 x double> @slto16f64(<16 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm5 +; NODQ-NEXT: vpextrq $1, %xmm5, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm6, %xmm6 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vmovq %xmm5, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm4 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm5 +; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm0 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm6[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 -; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm3 +; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NODQ-NEXT: vpextrq $1, %xmm5, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm6 +; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; NODQ-NEXT: vmovq %xmm5, %rax +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm3 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, 
%xmm4 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm7, %xmm1 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 @@ -1228,26 +1228,26 @@ define <8 x float> @ulto8f32(<8 x i64> %a) { ; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vmovq %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-NEXT: vmovq %xmm0, %rcx +; NODQ-NEXT: vcvtusi2ss %rcx, %xmm5, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; @@ -1272,52 +1272,52 @@ define <16 x float> @ulto16f32(<16 x i64> %a) { ; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NODQ-NEXT: vmovq %xmm4, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm5 +; NODQ-NEXT: vpextrq $1, %xmm4, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm6, %xmm4 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm6, %xmm6 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm7, %xmm7 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, 
%xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm8, %xmm8 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[2,3] +; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm4 +; NODQ-NEXT: vpextrq $1, %xmm4, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm5 +; NODQ-NEXT: vmovq %xmm4, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm4 +; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6 +; NODQ-NEXT: vmovq %xmm6, %rax +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm7 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0],xmm3[3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; NODQ-NEXT: vpextrq $1, %xmm6, %rax +; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[2,3] +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm5 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm6 +; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[2,3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm6[0],xmm3[3] +; NODQ-NEXT: vcvtusi2ss %rax, %xmm9, %xmm3 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq @@ -2383,8 +2383,8 @@ define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { ; NOVLDQ-LABEL: test_4f64tosb: ; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2406,8 +2406,8 @@ define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { ; ; DQNOVL-LABEL: test_4f64tosb: ; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 
def $zmm1 ; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 ; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2598,10 +2598,10 @@ define <2 x double> @test_sito2f64_mask_load(ptr%a, <2 x i64> %c) { ; NOVLDQ-LABEL: test_sito2f64_mask_load: ; NOVLDQ: # %bb.0: ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 -; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; NOVLDQ-NEXT: vcvtdq2pd (%rdi), %xmm1 +; NOVLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm2, %k1 +; NOVLDQ-NEXT: vmovapd %zmm1, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; NOVLDQ-NEXT: vzeroupper ; NOVLDQ-NEXT: retq @@ -2621,10 +2621,10 @@ define <2 x double> @test_sito2f64_mask_load(ptr%a, <2 x i64> %c) { ; ; DQNOVL-LABEL: test_sito2f64_mask_load: ; DQNOVL: # %bb.0: +; DQNOVL-NEXT: vcvtdq2pd (%rdi), %xmm1 ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 -; DQNOVL-NEXT: vcvtdq2pd (%rdi), %xmm0 -; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} +; DQNOVL-NEXT: vmovapd %zmm1, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; DQNOVL-NEXT: vzeroupper ; DQNOVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll index 5115c3cdc259a..712f32a6784e5 100644 --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -1256,9 +1256,9 @@ define void @load_v32i1_broadcast_31_v8i1_store(ptr %a0,ptr %a1) { ; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0 -; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm1 +; AVX512-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-FAST-NEXT: kmovb %k0, (%rsi) ; AVX512-FAST-NEXT: vzeroupper @@ -1519,9 +1519,9 @@ define void @load_v64i1_broadcast_63_v8i1_store(ptr %a0,ptr %a1) { ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0 -; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [7,7,7,7,7,7,7,7] +; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm1 +; AVX512-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-FAST-NEXT: kmovb %k0, (%rsi) ; AVX512-FAST-NEXT: vzeroupper @@ -1574,9 +1574,9 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) { ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0 -; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm1 +; 
AVX512-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512-FAST-NEXT: vpmovd2m %zmm0, %k0 ; AVX512-FAST-NEXT: kmovw %k0, (%rsi) ; AVX512-FAST-NEXT: vzeroupper @@ -1596,9 +1596,9 @@ define void @load_v64i1_broadcast_63_v16i1_store(ptr %a0,ptr %a1) { ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store: ; AVX512NOTDQ-FAST: # %bb.0: ; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1 -; AVX512NOTDQ-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512NOTDQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512NOTDQ-FAST-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 +; AVX512NOTDQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512NOTDQ-FAST-NEXT: kmovw %k0, (%rsi) ; AVX512NOTDQ-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index e183da1386d5b..27ae59f0eebb1 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -381,13 +381,21 @@ define i8 @test17(ptr%addr, i8 %a) nounwind { } define i64 @extract_v8i64(<8 x i64> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v8i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; KNL-LABEL: extract_v8i64: +; KNL: ## %bb.0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpextrq $1, %xmm1, (%rdi) +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v8i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 +; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %r1 = extractelement <8 x i64> %x, i32 1 %r2 = extractelement <8 x i64> %x, i32 3 store i64 %r2, ptr %dst, align 1 @@ -395,13 +403,21 @@ define i64 @extract_v8i64(<8 x i64> %x, ptr %dst) nounwind { } define i64 @extract_v4i64(<4 x i64> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v4i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; KNL-LABEL: extract_v4i64: +; KNL: ## %bb.0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpextrq $1, %xmm1, (%rdi) +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v4i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 +; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %r1 = extractelement <4 x i64> %x, i32 1 %r2 = extractelement <4 x i64> %x, i32 3 store i64 %r2, ptr %dst, align 1 @@ -409,11 +425,17 @@ define i64 @extract_v4i64(<4 x i64> %x, ptr %dst) nounwind { } define i64 @extract_v2i64(<2 x i64> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v2i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vmovq %xmm0, %rax -; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) -; CHECK-NEXT: retq +; KNL-LABEL: extract_v2i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpextrq $1, %xmm0, (%rdi) +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v2i64: +; SKX: ## %bb.0: +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) +; 
SKX-NEXT: retq %r1 = extractelement <2 x i64> %x, i32 0 %r2 = extractelement <2 x i64> %x, i32 1 store i64 %r2, ptr %dst, align 1 @@ -421,13 +443,21 @@ define i64 @extract_v2i64(<2 x i64> %x, ptr %dst) nounwind { } define i32 @extract_v16i32(<16 x i32> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v16i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; KNL-LABEL: extract_v16i32: +; KNL: ## %bb.0: +; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; KNL-NEXT: vextractps $1, %xmm1, (%rdi) +; KNL-NEXT: vextractps $1, %xmm0, %eax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v16i32: +; SKX: ## %bb.0: +; SKX-NEXT: vextractps $1, %xmm0, %eax +; SKX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; SKX-NEXT: vextractps $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %r1 = extractelement <16 x i32> %x, i32 1 %r2 = extractelement <16 x i32> %x, i32 5 store i32 %r2, ptr %dst, align 1 @@ -435,13 +465,21 @@ define i32 @extract_v16i32(<16 x i32> %x, ptr %dst) nounwind { } define i32 @extract_v8i32(<8 x i32> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v8i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; KNL-LABEL: extract_v8i32: +; KNL: ## %bb.0: +; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; KNL-NEXT: vextractps $1, %xmm1, (%rdi) +; KNL-NEXT: vextractps $1, %xmm0, %eax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v8i32: +; SKX: ## %bb.0: +; SKX-NEXT: vextractps $1, %xmm0, %eax +; SKX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; SKX-NEXT: vextractps $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 %r2 = extractelement <8 x i32> %x, i32 5 store i32 %r2, ptr %dst, align 1 @@ -449,11 +487,17 @@ define i32 @extract_v8i32(<8 x i32> %x, ptr %dst) nounwind { } define i32 @extract_v4i32(<4 x i32> %x, ptr %dst) nounwind { -; CHECK-LABEL: extract_v4i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractps $3, %xmm0, (%rdi) -; CHECK-NEXT: retq +; KNL-LABEL: extract_v4i32: +; KNL: ## %bb.0: +; KNL-NEXT: vextractps $3, %xmm0, (%rdi) +; KNL-NEXT: vextractps $1, %xmm0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: extract_v4i32: +; SKX: ## %bb.0: +; SKX-NEXT: vextractps $1, %xmm0, %eax +; SKX-NEXT: vextractps $3, %xmm0, (%rdi) +; SKX-NEXT: retq %r1 = extractelement <4 x i32> %x, i32 1 %r2 = extractelement <4 x i32> %x, i32 3 store i32 %r2, ptr %dst, align 1 @@ -1754,7 +1798,6 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $192, %rsp ; KNL-NEXT: movl 744(%rbp), %eax -; KNL-NEXT: andl $127, %eax ; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 @@ -1770,6 +1813,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0 +; KNL-NEXT: andl $127, %eax ; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 @@ -1788,23 +1832,22 @@ define i96 
@test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; KNL-NEXT: vmovd %edi, %xmm0 +; KNL-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm0, %xmm2 ; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm1 -; KNL-NEXT: vmovd %edi, %xmm2 -; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2 ; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3 @@ -1821,44 +1864,45 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm1 ; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm2 +; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm3, %xmm3 ; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; KNL-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm2, %xmm2 -; 
KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm2, %xmm2 -; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm3, %xmm3 -; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2 +; KNL-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; KNL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm2 ; KNL-NEXT: vpternlogq {{.*#+}} zmm2 = ~zmm2 ; KNL-NEXT: cmpb $0, 736(%rbp) @@ -1869,8 +1913,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %ecx @@ -1879,8 +1923,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1953,23 +1997,23 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 ; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 ; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vmovd %edi, %xmm2 +; SKX-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 
+; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; SKX-NEXT: vmovd %edi, %xmm1 -; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm1 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2 @@ -1987,23 +2031,23 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) n ; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm2, %xmm2 +; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1 -; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm2, %xmm1 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 @@ -2024,8 +2068,8 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, 
i8 %b, i32 %index) n ; SKX-NEXT: movl 744(%rbp), %eax ; SKX-NEXT: andl $127, %eax ; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0 -; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1 ; SKX-NEXT: cmpb $0, 736(%rbp) +; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; SKX-NEXT: vpmovm2b %k0, %zmm0 @@ -2075,8 +2119,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %ecx @@ -2085,8 +2129,8 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0 +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -2163,9 +2207,9 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind { ; KNL-NEXT: vucomiss %xmm2, %xmm1 ; KNL-NEXT: setb %al ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: vpsrld $16, %xmm0, %xmm0 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 +; KNL-NEXT: kmovw %eax, %k0 ; KNL-NEXT: vucomiss %xmm2, %xmm0 ; KNL-NEXT: setb %al ; KNL-NEXT: kmovw %eax, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index a8574c0b7516c..2208a04718ab7 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -66,9 +66,9 @@ define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 -; X86-NEXT: korw %k0, %k1, %k0 +; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k0 +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; X86-NEXT: korw %k1, %k0, %k0 ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: cmpw $-1, %ax ; X86-NEXT: sete %al @@ -115,9 +115,9 @@ define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 -; X86-NEXT: korw %k0, %k1, %k0 +; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k0 +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; X86-NEXT: korw %k1, %k0, %k0 ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: cmpw $0, %ax ; X86-NEXT: sete %al @@ -557,9 +557,9 @@ entry: define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm512_mask_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1} ; X86-NEXT: retl @@ -580,9 +580,9 @@ entry: define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm512_maskz_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl @@ -6775,7 +6775,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm2 @@ -7140,8 +7140,8 @@ define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-LABEL: test_mm512_mask_reduce_mul_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0 @@ -7157,8 +7157,8 @@ define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; ; X64-LABEL: test_mm512_mask_reduce_mul_epi32: ; X64: # %bb.0: # %entry -; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0 @@ -7670,8 +7670,8 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0 @@ -7978,7 +7978,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -8128,7 +8128,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X86-NEXT: vpminsq %zmm0, 
%zmm1, %zmm0 @@ -8531,8 +8531,8 @@ define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-LABEL: test_mm512_mask_reduce_max_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0 @@ -8548,8 +8548,8 @@ define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; ; X64-LABEL: test_mm512_mask_reduce_max_epi32: ; X64: # %bb.0: # %entry -; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0 @@ -8657,8 +8657,8 @@ define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0 @@ -8714,8 +8714,8 @@ define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-LABEL: test_mm512_mask_reduce_min_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0 @@ -8731,8 +8731,8 @@ define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; ; X64-LABEL: test_mm512_mask_reduce_min_epi32: ; X64: # %bb.0: # %entry -; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0 @@ -8842,8 +8842,8 @@ define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vminps %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 
c1ef500d9d3de..2db40be05f4b9 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -5979,22 +5979,22 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] ; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8] -; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -6026,22 +6026,22 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X86-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X86-NEXT: vpcmpneqd 
%zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] ; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6051,22 +6051,22 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X64-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] ; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: 
[0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6096,22 +6096,22 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] ; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; 
CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -6143,22 +6143,22 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] ; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## 
encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6168,22 +6168,22 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] ; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6214,22 +6214,22 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf2,0xf5,0x48,0x37,0xc0] -; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x02] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd9,0x05] -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe1] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0] -; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xc1] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6261,22 +6261,22 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xc0] -; X86-NEXT: vpcmpleq %zmm1, %zmm0, %k2 {%k1} ## encoding: 
[0x62,0xf3,0xfd,0x49,0x1f,0xd1,0x02] -; X86-NEXT: vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04] -; X86-NEXT: vpcmpnltq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x05] -; X86-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6287,22 +6287,22 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xc0] -; X64-NEXT: vpcmpleq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd1,0x02] -; X64-NEXT: vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04] -; X64-NEXT: vpcmpnltq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x05] -; X64-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, 
%xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6333,22 +6333,22 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x01] -; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc9,0x02] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xd9,0x05] -; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe1,0x06] ; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0] -; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: 
[0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6380,22 +6380,22 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x01] -; X86-NEXT: vpcmpleuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x02] -; X86-NEXT: vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04] -; X86-NEXT: vpcmpnltuq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xe1,0x05] -; X86-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 
## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6406,22 +6406,22 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1] ; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x01] -; X64-NEXT: vpcmpleuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x02] -; X64-NEXT: vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04] -; X64-NEXT: vpcmpnltuq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xe1,0x05] -; X64-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: 
[0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -10159,16 +10159,16 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) { define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; X86-LABEL: fmadd_ss_maskz_memfold: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] -; X86-NEXT: vfmadd231ss (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x01] +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x01] +; X86-NEXT: vfmadd231ss (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x00] ; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc0] -; X86-NEXT: vmovss %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x02] +; X86-NEXT: vmovss %xmm0, (%ecx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x01] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: fmadd_ss_maskz_memfold: @@ -10247,16 +10247,16 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) { define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; X86-LABEL: fmadd_sd_maskz_memfold: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] -; X86-NEXT: vfmadd231sd (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x01] +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x01] +; X86-NEXT: vfmadd231sd (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x00] ; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x10,0xc0] -; X86-NEXT: vmovsd %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x02] +; X86-NEXT: vmovsd %xmm0, (%ecx) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x01] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: 
fmadd_sd_maskz_memfold: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 926af4e9957af..d17785ebbfea8 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1013,20 +1013,18 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; X64-NEXT: vmovdqa %ymm1, %ymm0 +; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_vcvtps2ph_256: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z} +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1} -; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax) -; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm0 ; X86-NEXT: retl %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) @@ -6093,14 +6091,14 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; ; X86-LABEL: fmadd_ss_maskz_memfold: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} -; X86-NEXT: vmovss %xmm0, (%edx) +; X86-NEXT: vmovss %xmm0, (%ecx) ; X86-NEXT: retl %a.val = load float, ptr %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -6181,14 +6179,14 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; ; X86-LABEL: fmadd_sd_maskz_memfold: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z} -; X86-NEXT: vmovsd %xmm0, (%edx) +; X86-NEXT: vmovsd %xmm0, (%ecx) ; X86-NEXT: retl %a.val = load double, ptr %a %av0 = insertelement <2 x double> undef, double %a.val, i32 0 @@ -7107,18 +7105,18 @@ define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> @test_x86_avx512_psllv_d_512_const() { ; X64-LABEL: test_x86_avx512_psllv_d_512_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: 
vpaddd %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psllv_d_512_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7180,18 +7178,18 @@ define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 x i64> @test_x86_avx512_psllv_q_512_const() { ; X64-LABEL: test_x86_avx512_psllv_q_512_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psllv_q_512_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0] ; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295] ; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7355,18 +7353,18 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> @test_x86_avx512_psrlv_d_512_const() { ; X64-LABEL: test_x86_avx512_psrlv_d_512_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psrlv_d_512_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] ; X86-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] ; X86-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7428,18 +7426,18 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 
x i64> @test_x86_avx512_psrlv_q_512_const() { ; X64-LABEL: test_x86_avx512_psrlv_q_512_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] ; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] ; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_avx512_psrlv_q_512_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0] ; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 -; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295] ; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; X86-NEXT: retl @@ -7577,9 +7575,9 @@ define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp ; X86-NEXT: vmovaps 72(%ebp), %zmm3 -; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0 -; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k1 -; X86-NEXT: kunpckbw %k0, %k1, %k1 +; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k0 +; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k1 +; X86-NEXT: kunpckbw %k1, %k0, %k1 ; X86-NEXT: vmovaps 136(%ebp), %zmm3 {%k1} ; X86-NEXT: vmovaps %zmm3, %zmm0 ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8aa898f3ec576..8d98290ba29a6 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1060,12 +1060,12 @@ define i32 @test13_crash(i32 %x, i32 %y) { define <4 x i1> @test14() { ; CHECK-LABEL: test14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] ; CHECK-NEXT: retq ; ; X86-LABEL: test14: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1] +; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] ; X86-NEXT: retl %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll index f6e5986afac53..a938ebb307a50 100644 --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -243,15 +243,15 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double> ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 -; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: kshiftrw $8, %k2, %k2 -; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2} +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k2 +; AVX512F-NEXT: vblendmpd 
128(%rdi), %zmm3, %zmm5 {%k2} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: kshiftrw $8, %k2, %k1 ; AVX512F-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} ; AVX512F-NEXT: vmovapd %zmm5, %zmm2 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-nontemporal.ll b/llvm/test/CodeGen/X86/avx512-nontemporal.ll index 7c6837cca9023..197622992b020 100644 --- a/llvm/test/CodeGen/X86/avx512-nontemporal.ll +++ b/llvm/test/CodeGen/X86/avx512-nontemporal.ll @@ -24,10 +24,10 @@ define i32 @f(<16 x float> %A, <16 x float> %AA, ptr %B, <8 x double> %C, <8 x d ; CHECK-NEXT: vmovntpd %zmm0, (%rdi) ; CHECK-NEXT: vpaddd %zmm7, %zmm6, %zmm0 ; CHECK-NEXT: addl (%rsi), %eax +; CHECK-NEXT: vpaddw 80(%rbp), %zmm9, %zmm1 ; CHECK-NEXT: vmovntdq %zmm0, (%rdi) -; CHECK-NEXT: vpaddw 80(%rbp), %zmm9, %zmm0 ; CHECK-NEXT: addl (%rsi), %eax -; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: vmovntdq %zmm1, (%rdi) ; CHECK-NEXT: vpaddb 208(%rbp), %zmm8, %zmm0 ; CHECK-NEXT: addl (%rsi), %eax ; CHECK-NEXT: vmovntdq %zmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll index 162f5efd78f6d..0c57bec46ac6b 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -98,7 +98,7 @@ define dso_local i64 @caller_argv64i1() #0 { ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %edi ; X32-NEXT: subl $88, %esp -; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] +; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] ; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovups %zmm0, (%esp) ; X32-NEXT: movl $1, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 88c99a06326ab..2f84369cee298 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -1078,7 +1078,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; LINUXOSX64-NEXT: movl %eax, %r9d ; LINUXOSX64-NEXT: subl %ecx, %r9d ; LINUXOSX64-NEXT: imull %r9d, %r8d -; LINUXOSX64-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX64-NEXT: leal (%r14,%r13), %r9d ; LINUXOSX64-NEXT: movl %r13d, %r12d ; LINUXOSX64-NEXT: subl %r14d, %r12d ; LINUXOSX64-NEXT: imull %r11d, %r12d @@ -1217,24 +1217,22 @@ define dso_local x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signext, i32, i64, i16 signext, ptr) #0 { ; X32-LABEL: test_argRetMixTypes: ; X32: # %bb.0: -; X32-NEXT: pushl %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; X32-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; X32-NEXT: vcvtsi2sd %eax, %xmm2, %xmm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 +; X32-NEXT: vmovd %edx, %xmm2 +; X32-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; X32-NEXT: vcvtqq2pd %ymm2, %ymm2 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vmovd %edx, %xmm1 -; X32-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; X32-NEXT: vcvtqq2pd %ymm1, %ymm1 -; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vcvtsi2sd %esi, %xmm2, %xmm1 +; X32-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; X32-NEXT: vcvtsi2sd %esi, %xmm3, %xmm1 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vcvtsi2sdl (%ebx), %xmm2, %xmm1 +; X32-NEXT: vcvtsi2sdl (%eax), %xmm3, %xmm1 ; X32-NEXT: vaddsd %xmm1, 
%xmm0, %xmm0 ; X32-NEXT: vcvttsd2si %xmm0, %eax -; X32-NEXT: popl %ebx ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -1247,10 +1245,10 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex ; WIN64-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; WIN64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm2 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm1 -; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1 +; WIN64-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; WIN64-NEXT: vcvtsi2sdl (%rsi), %xmm3, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; WIN64-NEXT: vcvttsd2si %xmm0, %eax ; WIN64-NEXT: retq @@ -1264,10 +1262,10 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex ; LINUXOSX64-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; LINUXOSX64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm2 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm1 -; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1 +; LINUXOSX64-NEXT: vaddsd %xmm2, %xmm0, %xmm0 +; LINUXOSX64-NEXT: vcvtsi2sdl (%rsi), %xmm3, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; LINUXOSX64-NEXT: vcvttsd2si %xmm0, %eax ; LINUXOSX64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 721ffbe1ceb79..e9a6a90a25376 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -18,8 +18,8 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind { ; ; X64-LABEL: select00: ; X64: # %bb.0: -; X64-NEXT: cmpl $255, %edi ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: cmpl $255, %edi ; X64-NEXT: je .LBB0_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 @@ -46,8 +46,8 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind { ; ; X64-LABEL: select01: ; X64: # %bb.0: -; X64-NEXT: cmpl $255, %edi ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: cmpl $255, %edi ; X64-NEXT: je .LBB1_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index b3bf464b529d0..c713a3724f266 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -7,7 +7,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -19,7 +19,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -34,7 +34,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i define <8 x i16> 
@test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -49,7 +49,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -64,7 +64,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -79,7 +79,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -94,7 +94,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -108,7 +108,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -120,7 +120,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -135,7 +135,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i define <8 x i16> 
@test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -149,7 +149,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -162,7 +162,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -178,7 +178,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -195,7 +195,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -211,7 +211,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -228,7 +228,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> ; CHECK-FAST-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-FAST-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,8,11,8,13,8,15,9] ; CHECK-FAST-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -254,7 
+254,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9] ; CHECK-FAST-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-FAST-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -280,7 +280,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -293,7 +293,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -309,7 +309,7 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -325,7 +325,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -336,7 +336,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -350,7 +350,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = 
[24,28,29,26,28,29,17,12,22,8,25,27,28,18,30,18] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -364,7 +364,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -378,7 +378,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,21,31,29,25,2,19,15,20,27,7,23,3,7,25,10] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -392,7 +392,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -406,7 +406,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [26,3,4,22,1,18,29,17,21,0,20,19,18,12,11,31] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -419,7 +419,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -430,7 +430,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -444,7 +444,7 @@ define <16 x i16> 
@test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -457,7 +457,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <1 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,11,23,26,29,5,21,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -469,7 +469,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [6,11,23,26,29,5,21,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -484,7 +484,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,11,23,26,29,5,21,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,11,23,26,29,5,21,30] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -499,7 +499,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -514,7 +514,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -529,7 +529,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: 
vmovdqa %xmm1, %xmm0 @@ -544,7 +544,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -558,7 +558,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -570,7 +570,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -585,7 +585,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -599,7 +599,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -611,7 +611,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -626,7 +626,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] +; CHECK-NEXT: vmovdqa 
{{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -642,7 +642,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -657,7 +657,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -673,7 +673,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -688,7 +688,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -703,7 +703,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -715,7 +715,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # 
kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -730,7 +730,7 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i1 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -745,7 +745,7 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,1,21,17,30,30,29,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -758,7 +758,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,21,17,30,30,29,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -774,7 +774,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,21,17,30,30,29,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,21,17,30,30,29,1] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -791,7 +791,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -807,7 +807,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [23,22,20,22,28,20,11,17] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [23,22,20,22,28,20,11,17] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -824,7 +824,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: 
vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -840,7 +840,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -856,7 +856,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -869,7 +869,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -885,7 +885,7 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -901,7 +901,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -913,7 +913,7 @@ define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,0,3,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -925,7 +925,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2] ; CHECK-NEXT: vptestnmd 
%xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -940,7 +940,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -955,7 +955,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -970,7 +970,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,7,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1011,7 +1011,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,3,2,5] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1023,7 +1023,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1038,7 +1038,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,3,2,5] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1093,7 +1093,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> % ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1109,7 +1109,7 @@ define <4 x i32> 
@test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1126,7 +1126,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,3,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,3,4] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1142,7 +1142,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,3,4] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1159,7 +1159,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,1,2,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp @@ -1170,7 +1170,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> % ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,2,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7] ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1186,7 +1186,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,1,2,7] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1201,7 +1201,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -1212,7 +1212,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = 
[1,13,11,14,7,10,1,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1226,7 +1226,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1240,7 +1240,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1254,7 +1254,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1268,7 +1268,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1282,7 +1282,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1295,7 +1295,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -1306,7 +1306,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = 
[14,5,7,7,10,3,9,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -1320,7 +1320,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1333,7 +1333,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1345,7 +1345,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1360,7 +1360,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1375,7 +1375,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} @@ -1391,7 +1391,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4] ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1407,7 +1407,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = 
[1,1,13,0] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1422,7 +1422,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,1,13,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1436,7 +1436,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,0,0,13] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1448,7 +1448,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1463,7 +1463,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,0,13] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1477,7 +1477,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1487,7 +1487,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -1501,7 +1501,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: 
retq @@ -1516,7 +1516,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,11,14,3,8,9,13,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1531,7 +1531,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,11,14,3,8,9,13,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,11,14,3,8,9,13,7] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1547,7 +1547,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,6,9,13,12,10,0,2] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1562,7 +1562,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,6,9,13,12,10,0,2] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,6,9,13,12,10,0,2] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1577,7 +1577,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -1589,7 +1589,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1604,7 +1604,7 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd 
(%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1619,7 +1619,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [13,0,0,6] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1632,7 +1632,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1648,7 +1648,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1665,7 +1665,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,13,11,10,7,13,15,14] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1681,7 +1681,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,7,13,15,14] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,13,11,10,7,13,15,14] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1698,7 +1698,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1714,7 +1714,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,15,6,9] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd 
(%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1730,7 +1730,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,0,7,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [6,0,7,2] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1743,7 +1743,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,7,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,7,2] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1759,7 +1759,7 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,0,7,2] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,0,7,2] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1775,7 +1775,7 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper @@ -1783,11 +1783,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2] +; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,u,2] ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3] +; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3] ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vzeroupper ; CHECK-FAST-PERLANE-NEXT: retq @@ -1968,7 +1968,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1] ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -1991,7 +1991,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: ; 
CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2013,7 +2013,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2036,7 +2036,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2057,7 +2057,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,0,7] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -2075,7 +2075,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7] ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2098,7 +2098,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,0,7] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2120,7 +2120,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5] ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2143,7 +2143,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> 
%vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2165,7 +2165,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6] ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -2179,7 +2179,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,1,0,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2192,7 +2192,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -2210,7 +2210,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3] ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -2233,7 +2233,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2256,7 +2256,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -2269,7 +2269,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { ; 
CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2386,7 +2386,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,7,6,0] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2409,7 +2409,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,0] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2433,7 +2433,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,1,1,5] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2456,7 +2456,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,1,1,5] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2479,7 +2479,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,0,2] ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -2498,7 +2498,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2521,7 +2521,7 @@ define <4 x i64> 
@test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2545,7 +2545,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> % ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,6,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2560,7 +2560,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2576,7 +2576,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2599,7 +2599,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2622,7 +2622,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,2,3,2] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -2634,7 +2634,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> % ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2649,7 
+2649,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2665,7 +2665,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,7,5,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2688,7 +2688,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> % define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,5,1] ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -2711,7 +2711,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { ; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1] ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper @@ -2730,7 +2730,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> % ; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1] ; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2754,7 +2754,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> % define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1] +; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1] ; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2847,7 +2847,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, 
%xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2863,7 +2863,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2879,7 +2879,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2895,7 +2895,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2910,7 +2910,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,5,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -2922,7 +2922,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,3,5,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -2938,7 +2938,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,5,2] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2953,7 +2953,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [6,2,4,5] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -2966,7 +2966,7 @@ define <4 x float> 
@test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,2,4,5] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [6,2,4,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -2983,7 +2983,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [6,2,4,5] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -3001,7 +3001,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,3,3,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [6,3,3,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -3018,7 +3018,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,3,3,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [6,3,3,6] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -3036,7 +3036,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -3053,7 +3053,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -3070,7 +3070,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -3083,7 +3083,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; 
CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -3100,7 +3100,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -3117,7 +3117,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3128,7 +3128,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3143,7 +3143,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3158,7 +3158,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3173,7 +3173,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3188,7 +3188,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # 
%bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3203,7 +3203,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3217,7 +3217,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %v define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3228,7 +3228,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3243,7 +3243,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3257,7 +3257,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %v define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,8,9,10] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3269,7 +3269,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,8,9,10] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3285,7 +3285,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: 
test_masked_z_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,8,9,10] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3301,7 +3301,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] ; CHECK-FAST-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3313,7 +3313,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1 @@ -3329,7 +3329,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3341,7 +3341,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6] +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} @@ -3387,7 +3387,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %v define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [10,2,11,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3399,7 +3399,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [10,2,11,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -3415,7 +3415,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec define <4 x float> 
@test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [10,2,11,6] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3430,7 +3430,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %v define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3442,7 +3442,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3458,7 +3458,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3475,7 +3475,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3491,7 +3491,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3508,7 +3508,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3518,7 +3518,7 @@ define <8 x float> 
@test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3534,7 +3534,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3544,7 +3544,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK-FAST-PERLANE: # %bb.0: ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] +; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3560,7 +3560,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [15,13,11,11,3,12,4,1] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3572,7 +3572,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [15,13,11,11,3,12,4,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3588,7 +3588,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,13,11,11,3,12,4,1] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [15,13,11,11,3,12,4,1] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3605,7 +3605,7 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3] +; CHECK-NEXT: vmovaps 
{{.*#+}} xmm0 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3617,7 +3617,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3635,7 +3635,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3653,7 +3653,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [8,2,14,7,12,6,14,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3670,7 +3670,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,2,14,7,12,6,14,7] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [8,2,14,7,12,6,14,7] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3688,7 +3688,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm2 = [12,6,12,6,12,6,12,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,6,12,6,12,6,12,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3705,7 +3705,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [12,6,12,6,12,6,12,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,6,12,6,12,6,12,6] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3722,7 +3722,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9] +; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3,15,9] ; CHECK-NEXT: vpermps (%rdi), %zmm0, 
%zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3735,7 +3735,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -3752,7 +3752,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -3911,7 +3911,7 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,3,7,3] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3922,7 +3922,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,3,7,3] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -3937,7 +3937,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,3,7,3] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3952,7 +3952,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,7,6] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -3967,7 +3967,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,7,6] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4007,7 +4007,7 
@@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,1,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -4018,7 +4018,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,1,4] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4033,7 +4033,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,1,4] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4047,7 +4047,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,5] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [1,5] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 @@ -4073,7 +4073,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,5] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [1,5] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 @@ -4097,7 +4097,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,6,2,2] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4112,7 +4112,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,6,2,2] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4126,7 +4126,7 @@ 
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [5,0,7,0] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-FAST-NEXT: retq @@ -4144,7 +4144,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,0,7,0] ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4169,7 +4169,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,0,7,0] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4193,7 +4193,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6] ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4218,7 +4218,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4241,7 +4241,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] +; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-FAST-NEXT: vzeroupper @@ -4260,7 +4260,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v ; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [0,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] ; CHECK-FAST-NEXT: vxorpd 
%xmm4, %xmm4, %xmm4 ; CHECK-FAST-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4286,7 +4286,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,6] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4311,7 +4311,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %v ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,7] +; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -4327,7 +4327,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %v define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,7] +; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -4342,7 +4342,7 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [1,6,7,2] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -4354,7 +4354,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,6,7,2] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -4370,7 +4370,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2] +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -4387,7 +4387,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [7,0,6,2] ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 ; 
CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 @@ -4412,7 +4412,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [7,0,6,2] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -4437,7 +4437,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,2,3,4] ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -4462,7 +4462,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4] +; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4] ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -4487,7 +4487,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,2,1,0] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -4499,7 +4499,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [4,2,1,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -4515,7 +4515,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0] +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -4532,7 +4532,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,5,1] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [6,0,5,1] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; 
CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -4548,7 +4548,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,0,5,1] +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,0,5,1] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -4596,7 +4596,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,6,0,5] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,6,0,5] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -4608,7 +4608,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,6,0,5] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [4,6,0,5] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -4624,7 +4624,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,6,0,5] +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,6,0,5] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll index 367e28eb7364e..884ff6ccf675e 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll @@ -4,7 +4,7 @@ define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -13,7 +13,7 @@ define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -27,7 +27,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -39,7 +39,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -53,7 +53,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -65,7 +65,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -79,7 +79,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -91,7 +91,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> @@ -100,7 +100,7 @@ define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -114,7 +114,7 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve define <16 x i16> 
@test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -126,7 +126,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> % define <16 x i16> @test_16xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -136,7 +136,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask0(ptr %vp) { define <16 x i16> @test_masked_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -150,7 +150,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -164,7 +164,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask define <16 x i16> @test_masked_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -178,7 +178,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -192,7 +192,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask define <16 x i16> @test_masked_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: 
retq @@ -206,7 +206,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -220,7 +220,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask define <16 x i16> @test_16xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -230,7 +230,7 @@ define <16 x i16> @test_16xi16_perm_mem_mask3(ptr %vp) { define <16 x i16> @test_masked_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -244,7 +244,7 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -258,7 +258,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -267,7 +267,7 @@ define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -281,7 +281,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw 
{{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -293,7 +293,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -307,7 +307,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -319,7 +319,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -333,7 +333,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -345,7 +345,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; CHECK-LABEL: test_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> @@ -354,7 +354,7 @@ define <32 x i16> 
@test_32xi16_perm_mask3(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -368,7 +368,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -380,7 +380,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -390,7 +390,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask0(ptr %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -404,7 +404,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -418,7 +418,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(ptr %vp, <32 x i16> %mask define <32 x i16> @test_masked_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] 
; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -432,7 +432,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -446,7 +446,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(ptr %vp, <32 x i16> %mask define <32 x i16> @test_masked_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -460,7 +460,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -474,7 +474,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(ptr %vp, <32 x i16> %mask define <32 x i16> @test_32xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <32 x i16>, ptr %vp @@ -484,7 +484,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask3(ptr %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vptestnmw %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -498,7 +498,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %vec2, define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] ; CHECK-NEXT: vptestnmw %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -512,7 +512,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(ptr %vp, <32 x i16> %mask define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -521,7 +521,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -535,7 +535,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -547,7 +547,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -561,7 +561,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -573,7 +573,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -587,7 +587,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] +; CHECK-NEXT: 
vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -599,7 +599,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -608,7 +608,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 @@ -622,7 +622,7 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -634,7 +634,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask define <8 x i32> @test_8xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp @@ -644,7 +644,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(ptr %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -658,7 +658,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -672,7 +672,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { define <8 x i32> @test_masked_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -686,7 +686,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> 
@test_masked_z_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -700,7 +700,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { define <8 x i32> @test_masked_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -714,7 +714,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -728,7 +728,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { define <8 x i32> @test_8xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp @@ -738,7 +738,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(ptr %vp) { define <8 x i32> @test_masked_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq @@ -752,7 +752,7 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq @@ -766,7 +766,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -775,7 +775,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -789,7 +789,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -801,7 +801,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -815,7 +815,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -827,7 +827,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -841,7 +841,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -853,7 +853,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -862,7 +862,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, 
<16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -876,7 +876,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -888,7 +888,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_16xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -898,7 +898,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(ptr %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -912,7 +912,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -926,7 +926,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(ptr %vp, <16 x i32> %mask define <16 x i32> @test_masked_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -940,7 +940,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -954,7 +954,7 @@ define <16 x i32> 
@test_masked_z_16xi32_perm_mem_mask1(ptr %vp, <16 x i32> %mask define <16 x i32> @test_masked_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -968,7 +968,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -982,7 +982,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(ptr %vp, <16 x i32> %mask define <16 x i32> @test_16xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -992,7 +992,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(ptr %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vptestnmd %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1006,7 +1006,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %vec2, define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(ptr %vp, <16 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1254,7 +1254,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) { define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1263,7 +1263,7 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1277,7 +1277,7 @@ define <8 x i64> 
@test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1313,7 +1313,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1327,7 +1327,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1371,7 +1371,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1385,7 +1385,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1421,7 +1421,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1430,7 +1430,7 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -1444,7 +1444,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) { ; 
CHECK-LABEL: test_masked_z_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1480,7 +1480,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_8xi64_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp @@ -1490,7 +1490,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(ptr %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1504,7 +1504,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1544,7 +1544,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(ptr %vp, <8 x i64> %mas define <8 x i64> @test_masked_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1558,7 +1558,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1607,7 +1607,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(ptr %vp, <8 x i64> %mas define <8 x i64> @test_masked_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1621,7 +1621,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; 
CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1661,7 +1661,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(ptr %vp, <8 x i64> %mas define <8 x i64> @test_8xi64_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp @@ -1671,7 +1671,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(ptr %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vptestnmq %zmm1, %zmm1, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} ; CHECK-NEXT: retq @@ -1685,7 +1685,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %vec2, <8 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(ptr %vp, <8 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 ; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1725,7 +1725,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(ptr %vp, <8 x i64> %mas define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1734,7 +1734,7 @@ define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1749,7 +1749,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1762,7 +1762,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x floa define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1777,7 +1777,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x 
float> define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1790,7 +1790,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x floa define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1805,7 +1805,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1818,7 +1818,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x floa define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1827,7 +1827,7 @@ define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1842,7 +1842,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1855,7 +1855,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x floa define <8 x float> @test_8xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp @@ -1865,7 +1865,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(ptr %vp) { define <8 x float> @test_masked_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> 
%mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1880,7 +1880,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1895,7 +1895,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %m define <8 x float> @test_masked_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1910,7 +1910,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1925,7 +1925,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %m define <8 x float> @test_masked_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1940,7 +1940,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1955,7 +1955,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %m define <8 x float> @test_8xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, ptr %vp @@ -1965,7 +1965,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask3(ptr %vp) { define <8 x float> @test_masked_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: 
test_masked_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1980,7 +1980,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1995,7 +1995,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %m define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2004,7 +2004,7 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2019,7 +2019,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2032,7 +2032,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2047,7 +2047,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: 
vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2060,7 +2060,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] +; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2075,7 +2075,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2088,7 +2088,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2097,7 +2097,7 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2112,7 +2112,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2125,7 +2125,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x define <16 x float> @test_16xfloat_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -2135,7 +2135,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(ptr %vp) { define <16 x float> @test_masked_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2150,7 +2150,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> % define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2165,7 +2165,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(ptr %vp, <16 x float> define <16 x float> @test_masked_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2180,7 +2180,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> % define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2195,7 +2195,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(ptr %vp, <16 x float> define <16 x float> @test_masked_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2210,7 +2210,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> % define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2225,7 +2225,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(ptr %vp, <16 x float> define <16 x float> @test_16xfloat_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq 
%vec = load <16 x float>, ptr %vp @@ -2235,7 +2235,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(ptr %vp) { define <16 x float> @test_masked_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2250,7 +2250,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> % define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(ptr %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2515,7 +2515,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2524,7 +2524,7 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] +; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -2539,7 +2539,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2578,7 +2578,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] +; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -2593,7 +2593,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] +; CHECK-NEXT: vmovapd {{.*#+}} 
zmm2 = [7,5,5,5,3,5,1,7] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2640,7 +2640,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] +; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -2655,7 +2655,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2694,7 +2694,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2703,7 +2703,7 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] +; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} @@ -2718,7 +2718,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2757,7 +2757,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 define <8 x double> @test_8xdouble_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp @@ -2767,7 +2767,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(ptr %vp) { define <8 x double> @test_masked_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] ; CHECK-NEXT: vxorpd 
%xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -2782,7 +2782,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> % define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(ptr %vp, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] +; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2825,7 +2825,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(ptr %vp, <8 x dou define <8 x double> @test_masked_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -2840,7 +2840,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> % define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(ptr %vp, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] +; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2892,7 +2892,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(ptr %vp, <8 x dou define <8 x double> @test_masked_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -2907,7 +2907,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> % define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(ptr %vp, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] +; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2950,7 +2950,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(ptr %vp, <8 x dou define <8 x double> @test_8xdouble_perm_mem_mask6(ptr %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp @@ -2960,7 +2960,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(ptr %vp) { define <8 x double> @test_masked_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %vec2, <8 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] +; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] ; CHECK-NEXT: vxorpd 
%xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} @@ -2975,7 +2975,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> % define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(ptr %vp, <8 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] +; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 210513fe31783..c7d761d571fc3 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -866,9 +866,9 @@ define <8 x double> @test37_commute(<8 x double> %x, <8 x double> %x1, ptr %ptr) define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test38: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01] +; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] @@ -891,9 +891,9 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, ptr %ptr) nounwin define <4 x double> @test38_commute(<4 x double> %x, <4 x double> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test38_commute: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcmpgtpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x0e] +; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] @@ -916,9 +916,9 @@ define <4 x double> @test38_commute(<4 x double> %x, <4 x double> %x1, ptr %ptr) define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test39: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01] +; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -942,9 +942,9 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, ptr %ptr) nounwin define <2 x double> @test39_commute(<2 x double> %x, <2 x double> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test39_commute: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcmpgtpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x0e] +; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## 
encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1001,9 +1001,9 @@ define <16 x float> @test40_commute(<16 x float> %x, <16 x float> %x1, ptr %p define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test41: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x58,0xc2,0x0f,0x01] +; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] @@ -1026,9 +1026,9 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, ptr %ptr) nounwin define <8 x float> @test41_commute(<8 x float> %x, <8 x float> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test41_commute: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vcmpgtps (%rdi){1to16}, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x58,0xc2,0x0f,0x0e] +; AVX512-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] @@ -1051,9 +1051,9 @@ define <8 x float> @test41_commute(<8 x float> %x, <8 x float> %x1, ptr %ptr) define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test42: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x58,0xc2,0x0f,0x01] +; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1077,9 +1077,9 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, ptr %ptr) nounwin define <4 x float> @test42_commute(<4 x float> %x, <4 x float> %x1, ptr %ptr) nounwind { ; AVX512-LABEL: test42_commute: ; AVX512: ## %bb.0: -; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vcmpgtps (%rdi){1to16}, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x58,0xc2,0x0f,0x0e] +; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x65,0xc0] ; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] diff --git a/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll index 32f8c0f0be9f2..b0c473490ec66 100644 --- a/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll @@ -100,8 +100,8 @@ entry: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 { ; X86-LABEL: test_mm512_mask_cvtneps2bf16_512: ; X86: # %bb.0: # %entry -; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 
; X86-NEXT: vcvtneps2bf16 %zmm1, %ymm1 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc9] +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] ; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -109,8 +109,8 @@ define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x f ; ; X64-LABEL: test_mm512_mask_cvtneps2bf16_512: ; X64: # %bb.0: # %entry -; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X64-NEXT: vcvtneps2bf16 %zmm1, %ymm1 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc9] +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] ; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index ada2c8d53aa53..519739834bd19 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -15,13 +15,13 @@ define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp ; X86-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X86-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X86-NEXT: vpcmpneqb 8(%ebp), %zmm2, %k1 -; X86-NEXT: vpcmpneqb 72(%ebp), %zmm3, %k2 -; X86-NEXT: kandd %k0, %k2, %k0 -; X86-NEXT: kmovd %k0, %eax -; X86-NEXT: kshiftrq $32, %k2, %k0 -; X86-NEXT: kandd %k1, %k0, %k0 +; X86-NEXT: vpcmpneqb 8(%ebp), %zmm2, %k0 +; X86-NEXT: vpcmpneqb 72(%ebp), %zmm3, %k1 +; X86-NEXT: vpcmpneqb %zmm0, %zmm1, %k2 +; X86-NEXT: kandd %k2, %k1, %k2 +; X86-NEXT: kmovd %k2, %eax +; X86-NEXT: kshiftrq $32, %k1, %k1 +; X86-NEXT: kandd %k0, %k1, %k0 ; X86-NEXT: kmovd %k0, %edx ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 51ffeca52a665..2e73e1ca37a1e 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1908,16 +1908,17 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] ; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] ; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] -; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] +; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] +; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] -; X64-NEXT: kmovq %k0, %rcx # 
encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff] +; X64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -1994,18 +1995,18 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] ; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] ; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] -; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] +; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) @@ -2083,16 +2084,17 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] ; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] ; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] -; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] +; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] +; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] -; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff] +; X64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 
@llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -2169,18 +2171,18 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] ; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] ; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] -; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] +; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) @@ -2213,16 +2215,17 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] -; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff] +; X86-NEXT: decl %eax # encoding: [0x48] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2235,16 +2238,17 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # 
encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] -; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] +; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff] +; X64-NEXT: decl %eax # encoding: [0xff,0xc8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) @@ -2277,18 +2281,18 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2302,18 +2306,18 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] ; 
X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] -; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) @@ -2346,16 +2350,17 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] -; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff] +; X86-NEXT: decl %eax # encoding: [0x48] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2368,16 +2373,17 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] -; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] +; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff] +; X64-NEXT: decl %eax # encoding: [0xff,0xc8] ; X64-NEXT: vzeroupper # encoding: 
[0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) @@ -2410,18 +2416,18 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2435,18 +2441,18 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] -; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 41e2aa003ce7a..eff0aaac136c5 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1412,20 +1412,18 @@ define <32 x i16>@test_int_x86_avx512_maskz_psrav32_hi(<32 
x i16> %x0, <32 x i16 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; X86: # %bb.0: -; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; X64: # %bb.0: -; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index ae710cc40a522..d96e63d44eec5 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4800,15 +4800,12 @@ declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-LABEL: test_cmp_b_256: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx # encoding: [0x53] -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %edi # encoding: [0x57] -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: .cfi_offset %esi, -16 -; X86-NEXT: .cfi_offset %edi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] @@ -4820,10 +4817,10 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] -; X86-NEXT: kmovd %k0, 
%ebx # encoding: [0xc5,0xfb,0x93,0xd8] ; X86-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X86-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X86-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02] ; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X86-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] @@ -4836,10 +4833,8 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; X86-NEXT: popl %esi # encoding: [0x5e] -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4856,10 +4851,10 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] +; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02] ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] @@ -4894,51 +4889,46 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; X86-LABEL: test_mask_cmp_b_256: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp # encoding: [0x55] -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx # encoding: [0x53] -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %edi # encoding: [0x57] -; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: .cfi_offset %esi, -20 -; X86-NEXT: .cfi_offset %edi, -16 -; X86-NEXT: .cfi_offset %ebx, -12 -; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14] +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -16 +; X86-NEXT: .cfi_offset %edi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; 
X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05] -; X86-NEXT: kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8] -; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] -; X86-NEXT: kmovd %k0, %ebp # encoding: [0xc5,0xfb,0x93,0xe8] -; X86-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] -; X86-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x01] -; X86-NEXT: vpinsrd $2, %ebp, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc5,0x02] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] +; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc9] +; X86-NEXT: kmovd %k1, %ebx # encoding: [0xc5,0xfb,0x93,0xd9] +; X86-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] +; X86-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] +; X86-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02] ; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] ; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] ; X86-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X86-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: vmovd %esi, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd6] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X86-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; X86-NEXT: popl %esi # encoding: [0x5e] -; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebp # encoding: [0x5d] +; X86-NEXT: popl %ebx # encoding: [0x5b] ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4949,18 +4939,18 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; X64-NEXT: kmovd %k0, %esi # encoding: 
[0xc5,0xfb,0x93,0xf0] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] -; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] -; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] -; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x01] -; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02] +; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] +; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc9] +; X64-NEXT: kmovd %k1, %r8d # encoding: [0xc5,0x7b,0x93,0xc1] +; X64-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] +; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x01] +; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] ; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] @@ -4994,15 +4984,12 @@ declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) noun define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-LABEL: test_ucmp_b_256: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx # encoding: [0x53] -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %edi # encoding: [0x57] -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: .cfi_offset %esi, -16 -; X86-NEXT: .cfi_offset %edi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] @@ -5014,10 +5001,10 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06] -; X86-NEXT: kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8] ; X86-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X86-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X86-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02] ; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X86-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] @@ -5030,10 +5017,8 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; X86-NEXT: popl %esi # encoding: [0x5e] -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5050,10 +5035,10 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] ; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] ; X64-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] +; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02] ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X64-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X64-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] @@ -5088,51 +5073,46 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; X86-LABEL: test_mask_ucmp_b_256: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp # encoding: [0x55] -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx # encoding: [0x53] -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %edi # encoding: [0x57] -; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: .cfi_offset %esi, -20 -; X86-NEXT: .cfi_offset %edi, -16 -; X86-NEXT: .cfi_offset %ebx, -12 -; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14] +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -16 +; X86-NEXT: .cfi_offset %edi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] -; X86-NEXT: kmovd %k0, %ebx # 
encoding: [0xc5,0xfb,0x93,0xd8] -; X86-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] -; X86-NEXT: kmovd %k0, %ebp # encoding: [0xc5,0xfb,0x93,0xe8] -; X86-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] -; X86-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x01] -; X86-NEXT: vpinsrd $2, %ebp, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc5,0x02] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] +; X86-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc9,0x06] +; X86-NEXT: kmovd %k1, %ebx # encoding: [0xc5,0xfb,0x93,0xd9] +; X86-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] +; X86-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] +; X86-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02] ; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] ; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] ; X86-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X86-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: vmovd %esi, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd6] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X86-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; X86-NEXT: popl %esi # encoding: [0x5e] -; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebp # encoding: [0x5d] +; X86-NEXT: popl %ebx # encoding: [0x5b] ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5143,18 +5123,18 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] -; X64-NEXT: kmovd %k0, %r8d # encoding: [0xc5,0x7b,0x93,0xc0] -; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] -; X64-NEXT: kmovd %k0, %r9d # encoding: [0xc5,0x7b,0x93,0xc8] -; X64-NEXT: vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; X64-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0 # encoding: 
[0xc4,0xc3,0x79,0x22,0xc0,0x01] -; X64-NEXT: vpinsrd $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc1,0x02] +; X64-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] +; X64-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc9,0x06] +; X64-NEXT: kmovd %k1, %r8d # encoding: [0xc5,0x7b,0x93,0xc1] +; X64-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] +; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x01] +; X64-NEXT: vpinsrd $2, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x22,0xc0,0x02] ; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] +; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] ; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X64-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] @@ -5190,22 +5170,22 @@ define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] ; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8] -; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltw 
%ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -5237,22 +5217,22 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X86-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X86-NEXT: kmovd %k0, 
%ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5262,22 +5242,22 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X64-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] ; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5307,22 +5287,22 @@ define <8 x i16> 
@test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] ; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -5354,22 +5334,22 @@ define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: 
[0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5379,22 +5359,22 @@ define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] ; X64-NEXT: kmovd %k2, %eax # 
encoding: [0xc5,0xfb,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5424,22 +5404,22 @@ define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] ; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8] -; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -5470,22 +5450,22 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5494,22 +5474,22 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] ; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 
{%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5538,22 +5518,22 @@ define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] ; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06] ; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: 
[0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] ; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] @@ -5584,22 +5564,22 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 
# EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5608,22 +5588,22 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] ; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5653,22 +5633,22 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1] ; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x65,0xc0] -; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x02] -; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x05] -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xe1] ; CHECK-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; CHECK-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -5699,22 +5679,22 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # 
encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpgtw %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x65,0xc0] -; X86-NEXT: vpcmplew %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x02] -; X86-NEXT: vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04] -; X86-NEXT: vpcmpnltw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x05] -; X86-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmplew %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5724,22 +5704,22 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtw %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x65,0xc0] -; X64-NEXT: vpcmplew %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x02] -; X64-NEXT: vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04] -; 
X64-NEXT: vpcmpnltw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x05] -; X64-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmplew %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) @@ -5769,22 +5749,22 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1] ; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x01] -; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x02] -; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x05] -; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x06] ; CHECK-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; 
CHECK-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x04] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x05] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x06] +; CHECK-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -5815,22 +5795,22 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x01] -; X86-NEXT: vpcmpleuw %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x02] -; X86-NEXT: vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04] -; X86-NEXT: vpcmpnltuw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x05] -; X86-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovd %k2, %ecx # encoding: 
[0xc5,0xfb,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5840,22 +5820,22 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x01] -; X64-NEXT: vpcmpleuw %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x02] -; X64-NEXT: vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04] -; X64-NEXT: vpcmpnltuw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x05] -; X64-NEXT: vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovd %k4, %eax # encoding: 
[0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask) diff --git a/llvm/test/CodeGen/X86/avx512fp16-frem.ll b/llvm/test/CodeGen/X86/avx512fp16-frem.ll index 2164c2460f6d7..d11b847c48cdb 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-frem.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-frem.ll @@ -19,59 +19,59 @@ define <2 x half> @frem_vec2(<2 x half> %x, <2 x half> %y) nounwind { ; CHECK-LABEL: frem_vec2: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -79,16 +79,16 @@ define <2 x half> 
@frem_vec2(<2 x half> %x, <2 x half> %y) nounwind { ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -108,59 +108,59 @@ define <4 x half> @frem_vec4(<4 x half> %x, <4 x half> %y) nounwind { ; CHECK-LABEL: frem_vec4: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -168,16 +168,16 @@ define <4 x half> @frem_vec4(<4 x half> %x, <4 x half> %y) nounwind { ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrld $16, (%rsp), 
%xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -197,59 +197,59 @@ define <8 x half> @frem_vec8(<8 x half> %x, <8 x half> %y) nounwind { ; CHECK-LABEL: frem_vec8: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm1, 
%xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -257,16 +257,16 @@ define <8 x half> @frem_vec8(<8 x half> %x, <8 x half> %y) nounwind { ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vpsrld $16, (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -307,13 +307,13 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -328,14 +328,14 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind { ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -349,13 +349,13 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -372,12 +372,12 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind { ; CHECK-NEXT: 
vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vzeroupper @@ -409,12 +409,12 @@ define <16 x half> @frem_vec16(<16 x half> %x, <16 x half> %y) nounwind { ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -483,13 +483,13 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -530,13 +530,13 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; 
CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -578,13 +578,13 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -599,14 +599,14 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # 
xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -620,13 +620,13 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: # xmm1 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 @@ -645,11 +645,11 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vzeroupper @@ -681,12 +681,12 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 
; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -718,12 +718,12 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vzeroupper @@ -755,12 +755,12 @@ define <32 x half> @frem_vec32(<32 x half> %x, <32 x half> %y) nounwind { ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT @@ -849,9 +849,9 @@ define <4 x half> @frem_strict_vec4(<4 x half> %x, <4 x half> %y) nounwind #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm2 ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm2 ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm0 ; CHECK-NEXT: callq fmodf@PLT @@ -895,59 +895,59 @@ define <8 x half> @frem_strict_vec8(<8 x half> %x, <8 x half> %y) nounwind #0 { ; CHECK-LABEL: frem_strict_vec8: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, 
(%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm2, %xmm2, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte 
Folded Reload +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT @@ -955,16 +955,16 @@ define <8 x half> @frem_strict_vec8(<8 x half> %x, <8 x half> %y) nounwind #0 { ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1006,14 +1006,14 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1028,12 +1028,12 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind # ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 @@ -1048,14 +1048,14 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1072,10 +1072,10 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind # ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper @@ -1108,11 +1108,11 @@ define <16 x half> @frem_strict_vec16(<16 x half> %x, <16 x half> %y) nounwind # ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 
-; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT @@ -1182,14 +1182,14 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1229,12 +1229,12 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $10, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 @@ -1277,14 +1277,14 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = 
mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1299,12 +1299,12 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 @@ -1319,14 +1319,14 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1344,10 +1344,10 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vpsrlq $48, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper @@ -1380,11 +1380,11 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT @@ -1418,10 +1418,10 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper @@ -1454,11 +1454,11 @@ define <32 x half> @frem_strict_vec32(<32 x half> %x, <32 x half> %y) nounwind # ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: vcvtsh2ss 
%xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 1a60644b2fc22..c31167742e336 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -1987,9 +1987,9 @@ entry: define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm_mask_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} ; X86-NEXT: retl @@ -2011,9 +2011,9 @@ entry: define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm_maskz_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: retl @@ -2036,9 +2036,9 @@ entry: define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm256_mask_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} ; X86-NEXT: retl @@ -2060,9 +2060,9 @@ entry: define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) { ; X86-LABEL: test_mm256_maskz_set1_epi64: ; X86: # %bb.0: # %entry -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index e8e22bae23c92..1d5360c039359 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -7323,18 +7323,18 @@ define <8 x i32>@test_int_x86_avx512_maskz_psrav8_si(<8 x i32> %x0, <8 x i32> %x define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() { ; X86-LABEL: test_int_x86_avx512_mask_psrav8_si_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X86-NEXT: 
# fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav8_si_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] @@ -8636,18 +8636,18 @@ define <2 x i64>@test_int_x86_avx512_maskz_psrav_q_128(<2 x i64> %x0, <2 x i64> define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,18446744073709551607] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [2,0,4294967287,4294967295] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav_q_128_const: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,18446744073709551607] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [2,18446744073709551607] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] @@ -9497,22 +9497,22 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x66,0xc0] -; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x02] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x05] -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k4 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xe1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; CHECK-NEXT: vmovd %eax, 
%xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -9544,22 +9544,22 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x66,0xc0] -; X86-NEXT: vpcmpled %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x02] -; X86-NEXT: vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04] -; X86-NEXT: vpcmpnltd %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x05] -; X86-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrb $5, 
%ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmpled %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -9570,22 +9570,22 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x66,0xc0] -; X64-NEXT: vpcmpled %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x02] -; X64-NEXT: vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04] -; X64-NEXT: vpcmpnltd %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x05] -; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmpled %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # 
encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -9616,22 +9616,22 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1] ; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x01] -; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x02] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04] -; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x05] -; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x06] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; CHECK-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; CHECK-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 # encoding: 
[0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -9663,22 +9663,22 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X86-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x01] -; X86-NEXT: vpcmpleud %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x02] -; X86-NEXT: vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04] -; X86-NEXT: vpcmpnltud %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x05] -; X86-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01] -; X86-NEXT: kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca] -; X86-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02] -; X86-NEXT: kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04] -; X86-NEXT: kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd2,0x01] +; X86-NEXT: vpcmpleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02] +; X86-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04] +; X86-NEXT: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05] +; X86-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -9689,22 +9689,22 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: 
[0x62,0xf1,0x7d,0x29,0x76,0xc1] ; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x01] -; X64-NEXT: vpcmpleud %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x02] -; X64-NEXT: vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04] -; X64-NEXT: vpcmpnltud %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x05] -; X64-NEXT: vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] ; X64-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0] -; X64-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01] -; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; X64-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X64-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01] +; X64-NEXT: vpcmpleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; X64-NEXT: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; X64-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; X64-NEXT: vpinsrb $7, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -9734,22 +9734,22 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1] ; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 # encoding: [0x62,0xf2,0xf5,0x28,0x37,0xc8] -; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xe9] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # 
encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -9782,23 +9782,23 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpgtq %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x37,0xd0] ; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02] ; X86-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xf1] -; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X86-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: 
[0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpgtq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xd1] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -9810,23 +9810,23 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpgtq %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x37,0xd0] ; X64-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02] ; X64-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xf1] -; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X64-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xd1] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -9856,22 +9856,22 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1] ; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: 
vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -9904,23 +9904,23 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01] ; X86-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02] ; X86-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x06] -; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X86-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpnleuq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x06] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -9932,23 +9932,23 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01] ; X64-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02] ; X64-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x06] -; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X64-NEXT: kshiftrw $12, %k1, %k1 # encoding: 
[0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpnleuq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x06] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -9978,22 +9978,22 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1] ; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x66,0xc8] -; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xe9] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; 
CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -10025,23 +10025,23 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x66,0xd0] ; X86-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02] ; X86-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xf1] -; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X86-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xd1] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, 
%eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10052,23 +10052,23 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; X64-NEXT: vpcmpgtd %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x66,0xd0] ; X64-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02] ; X64-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xf1] -; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X64-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpgtd %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xd1] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) @@ -10097,22 +10097,22 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, 
%k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1] ; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -10144,23 +10144,23 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; X86-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01] ; X86-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02] ; X86-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x06] -; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X86-NEXT: kshiftrw $12, %k1, %k1 # encoding: 
[0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpnleud %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x06] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10171,23 +10171,23 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; X64-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01] ; X64-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02] ; X64-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x06] -; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] -; X64-NEXT: kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: 
[0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpnleud %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x06] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0c] +; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) @@ -10216,22 +10216,22 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1] ; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x08,0x37,0xc8] -; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xe9] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: 
vpcmpnltq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -10263,23 +10263,23 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x37,0xd0] ; X86-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02] ; X86-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xf1] -; X86-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] -; X86-NEXT: kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpgtq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xd1] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0e] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10290,23 +10290,23 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpgtq %xmm0, %xmm1, %k2 {%k1} # encoding: 
[0x62,0xf2,0xf5,0x09,0x37,0xd0] ; X64-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02] ; X64-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xf1] -; X64-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] -; X64-NEXT: kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpgtq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xd1] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0e] +; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) @@ -10335,22 +10335,22 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1] ; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x06] ; CHECK-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrb $1, %eax, %xmm0, 
%xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] +; CHECK-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; CHECK-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x02] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x04] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04] +; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x05] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05] +; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x06] +; CHECK-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] +; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06] ; CHECK-NEXT: movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00] ; CHECK-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -10382,23 +10382,23 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; X86-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01] ; X86-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02] ; X86-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x06] -; X86-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] -; X86-NEXT: kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e] ; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X86-NEXT: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x05] ; X86-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X86-NEXT: vpinsrb $2, %eax, %xmm2, 
%xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X86-NEXT: vpcmpnleuq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x06] ; X86-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X86-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X86-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X86-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X86-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X86-NEXT: kshiftrw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0e] +; X86-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X86-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -10409,23 +10409,23 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; X64-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01] ; X64-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02] ; X64-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x06] -; X64-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] -; X64-NEXT: kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e] ; X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] +; X64-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X64-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01] +; X64-NEXT: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x05] ; X64-NEXT: kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] +; X64-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02] +; X64-NEXT: vpcmpnleuq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x06] ; X64-NEXT: kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5] +; X64-NEXT: vpinsrb $4, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x04] +; X64-NEXT: kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05] -; X64-NEXT: kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6] +; X64-NEXT: kshiftrw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc1,0x0e] +; 
X64-NEXT: kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2] ; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06] -; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] +; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) @@ -17422,9 +17422,9 @@ define void @test_cmp_256(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f ; X86-NEXT: andl $-32, %esp # encoding: [0x83,0xe4,0xe0] ; X86-NEXT: subl $32, %esp # encoding: [0x83,0xec,0x20] ; X86-NEXT: movl 40(%ebp), %eax # encoding: [0x8b,0x45,0x28] -; X86-NEXT: vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01] -; X86-NEXT: vcmpltps 8(%ebp), %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01] -; X86-NEXT: kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8] +; X86-NEXT: vcmpltps 8(%ebp), %ymm2, %k0 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x85,0x08,0x00,0x00,0x00,0x01] +; X86-NEXT: vcmpltps %ymm1, %ymm0, %k1 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x01] +; X86-NEXT: kunpckbw %k1, %k0, %k1 # encoding: [0xc5,0xfd,0x4b,0xc9] ; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] ; X86-NEXT: vmovaps %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00] ; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 0973824fbb0ef..5413471983541 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4374,22 +4374,22 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s ; X86-LABEL: test_x86_vcvtps2ph_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128: ; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] ; 
X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask) @@ -4405,23 +4405,23 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s ; X86-LABEL: test_x86_vcvtps2ph_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_256: ; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) @@ -7031,18 +7031,18 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) { ; X86-LABEL: combine_vpermi2d_vpermps: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] -; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] +; X86-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] ; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: combine_vpermi2d_vpermps: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] -; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] +; X64-NEXT: # EVEX TO VEX Compression 
encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] ; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll index 9741972767bcd..14120de2b5498 100644 --- a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -173,9 +173,9 @@ define void @test_mm256_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapt ; X86-LABEL: test_mm256_2intersect_epi32_b: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x01] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vpbroadcastd (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x02] ; X86-NEXT: vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] diff --git a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll index 28e3d6dd5d849..79f492d8a4a71 100644 --- a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -184,9 +184,9 @@ define void @test_mm512_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapt ; X86-LABEL: test_mm512_2intersect_epi64_b: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastq (%ecx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x01] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vpbroadcastq (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x02] ; X86-NEXT: vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll index ea4d32bae9ccb..087c57a28481a 100644 --- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -155,18 +155,18 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 { ; FAST_ISEL_AVXNECONVERT: # %bb.0: ; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $2, %xmm0, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %ecx ; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 -; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %eax -; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax -; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm2 ; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm0, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: shll 
$16, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 ; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 -; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 -; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm1 -; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm1, %rax ; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx @@ -745,8 +745,8 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { ; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 ; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2 -; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax ; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 ; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 684e2921b789e..d20a088fcff9f 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -10,15 +10,15 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl (%edx), %edx -; X86-NEXT: shll $16, %edx -; X86-NEXT: vmovd %edx, %xmm0 ; X86-NEXT: movzwl (%ecx), %ecx ; X86-NEXT: shll $16, %ecx -; X86-NEXT: vmovd %ecx, %xmm1 +; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm1 ; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $0, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -370,12 +370,12 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind { ; X86-LABEL: add_constant: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $0, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -1085,7 +1085,7 @@ define <32 x bfloat> @pr63017_2() nounwind { ; ; AVXNC-LABEL: pr63017_2: ; AVXNC: # %bb.0: -; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; AVXNC-NEXT: vmovaps {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] ; AVXNC-NEXT: testb %al, %al ; AVXNC-NEXT: jne .LBB16_2 ; AVXNC-NEXT: # %bb.1: # %cond.load @@ -1235,10 +1235,10 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) { ; ; SSE2-LABEL: pr64460_4: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor 
%xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: cvtps2pd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: cvtps2pd %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -1983,11 +1983,11 @@ define void @PR92471(ptr %0, ptr %1) nounwind { ; X86-LABEL: PR92471: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0 -; X86-NEXT: vpinsrd $2, 8(%ecx), %xmm0, %xmm0 -; X86-NEXT: vpinsrw $6, 12(%ecx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $2, 8(%eax), %xmm0, %xmm0 +; X86-NEXT: vpinsrw $6, 12(%eax), %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: vpslld $16, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll index 6acce84645e88..97d4f5ee8f8ad 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -18,9 +18,9 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) { ; SSE-NEXT: pcmpgtq %xmm5, %xmm1 ; SSE-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packssdw %xmm11, %xmm10 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8 @@ -114,9 +114,9 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> ; SSE-NEXT: cmpltpd %xmm1, %xmm5 ; SSE-NEXT: cmpltpd %xmm0, %xmm4 ; SSE-NEXT: packssdw %xmm5, %xmm4 -; SSE-NEXT: packssdw %xmm6, %xmm4 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: packssdw %xmm6, %xmm4 ; SSE-NEXT: packssdw %xmm11, %xmm10 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8 @@ -192,8 +192,8 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { ; SSE-LABEL: v32i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtw %xmm4, %xmm0 @@ -205,13 +205,13 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { ; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: packsswb %xmm10, %xmm11 -; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: packsswb %xmm8, %xmm9 +; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: packsswb %xmm9, %xmm8 ; SSE-NEXT: pmovmskb %xmm11, %ecx -; SSE-NEXT: pmovmskb %xmm9, %eax +; SSE-NEXT: pmovmskb %xmm8, %eax ; SSE-NEXT: shll $16, 
%eax ; SSE-NEXT: orl %ecx, %eax ; SSE-NEXT: retq @@ -301,25 +301,25 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) { ; SSE-LABEL: v16i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE-NEXT: pcmpgtd %xmm5, %xmm1 ; SSE-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE-NEXT: pcmpgtd %xmm7, %xmm3 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pcmpgtd %xmm7, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: packssdw %xmm10, %xmm11 -; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: packssdw %xmm8, %xmm9 -; SSE-NEXT: packsswb %xmm9, %xmm11 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: packsswb %xmm8, %xmm11 ; SSE-NEXT: pmovmskb %xmm11, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq @@ -400,25 +400,25 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) { define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) { ; SSE-LABEL: v16f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: cmpltps %xmm0, %xmm4 ; SSE-NEXT: cmpltps %xmm1, %xmm5 ; SSE-NEXT: cmpltps %xmm2, %xmm6 -; SSE-NEXT: cmpltps %xmm3, %xmm7 ; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: cmpltps %xmm3, %xmm7 ; SSE-NEXT: andps %xmm4, %xmm11 ; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: andps %xmm5, %xmm10 ; SSE-NEXT: packssdw %xmm10, %xmm11 -; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: andps %xmm6, %xmm9 ; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: andps %xmm7, %xmm8 -; SSE-NEXT: packssdw %xmm8, %xmm9 -; SSE-NEXT: packsswb %xmm9, %xmm11 +; SSE-NEXT: andps %xmm6, %xmm8 +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: andps %xmm7, %xmm9 +; SSE-NEXT: packssdw %xmm9, %xmm8 +; SSE-NEXT: packsswb %xmm8, %xmm11 ; SSE-NEXT: pmovmskb %xmm11, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq @@ -502,8 +502,8 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pand %xmm1, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: pand %xmm3, %xmm8 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 423f2c49e70e5..05ffccbf5c01d 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -28,7 +28,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: 
vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -37,7 +37,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -67,7 +67,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -76,7 +76,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -108,7 +108,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -117,7 +117,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -221,7 +221,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -265,7 +265,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -429,20 +429,20 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i8_8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[1,2,4,8] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -496,10 +496,10 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index f478fa5a1f6cd..f3066e0ceda28 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -30,7 +30,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -40,7 +40,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -81,7 +81,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -91,7 +91,7 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -134,7 +134,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 @@ -144,7 +144,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 @@ -274,7 +274,7 @@ define <4 x i64> 
@ext_i4_4i64(i4 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0 @@ -330,7 +330,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 @@ -536,7 +536,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [16,32,64,128] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 @@ -551,11 +551,11 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm1 @@ -609,27 +609,27 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 ; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i16_16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm1 @@ -692,7 
+692,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] @@ -782,14 +782,14 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 2a79dae43bb2f..886783b39531a 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -22,7 +22,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -32,7 +32,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,2] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0 @@ -63,7 +63,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -73,7 +73,7 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -106,7 +106,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; 
AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 @@ -116,7 +116,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 @@ -149,7 +149,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 @@ -159,7 +159,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..2e83953eb5b1c 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -966,7 +966,7 @@ define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind { ; ; AVX512-LABEL: trunc_v32i16_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index d92e1a1e7b9d4..31df2704b1f36 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1106,32 +1106,31 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: bswapq %rdi -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: shrq $4, %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: bswapq %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: shrq $4, %rax ; X64-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r11, %rax ; X64-NEXT: andq %r11, %r10 -; X64-NEXT: andq %r11, %rdi -; X64-NEXT: shlq $4, %rdi -; X64-NEXT: orq %r10, %rdi -; X64-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 -; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: andq %r10, %r14 -; X64-NEXT: shrq $2, %rdi -; X64-NEXT: andq %r10, %rdi -; X64-NEXT: leaq (%rdi,%r14,4), %rdi +; X64-NEXT: shlq $4, %r10 +; X64-NEXT: orq %rax, %r10 +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; 
X64-NEXT: movq %r10, %r14 +; X64-NEXT: andq %rax, %r14 +; X64-NEXT: shrq $2, %r10 +; X64-NEXT: andq %rax, %r10 +; X64-NEXT: leaq (%r10,%r14,4), %r10 ; X64-NEXT: movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000 -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r10, %r13 ; X64-NEXT: andq %r14, %r13 -; X64-NEXT: shrq %rdi -; X64-NEXT: andq %r14, %rdi -; X64-NEXT: leaq (%rdi,%r13,2), %rdi +; X64-NEXT: shrq %r10 +; X64-NEXT: andq %r14, %r10 ; X64-NEXT: bswapq %rbx +; X64-NEXT: leaq (%r10,%r13,2), %r10 ; X64-NEXT: movq %rbx, %r14 ; X64-NEXT: shrq $4, %r14 ; X64-NEXT: andq %r11, %r14 @@ -1139,9 +1138,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %rbx ; X64-NEXT: orq %r14, %rbx ; X64-NEXT: movq %rbx, %r14 -; X64-NEXT: andq %r10, %r14 +; X64-NEXT: andq %rax, %r14 ; X64-NEXT: shrq $2, %rbx -; X64-NEXT: andq %r10, %rbx +; X64-NEXT: andq %rax, %rbx ; X64-NEXT: leaq (%rbx,%r14,4), %rbx ; X64-NEXT: movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555 ; X64-NEXT: movq %rbx, %r13 @@ -1149,7 +1148,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shrq %rbx ; X64-NEXT: andq %r14, %rbx ; X64-NEXT: leaq (%rbx,%r13,2), %rbx -; X64-NEXT: shrdq $48, %rbx, %rdi +; X64-NEXT: shrdq $48, %rbx, %r10 ; X64-NEXT: bswapq %r15 ; X64-NEXT: movq %r15, %r13 ; X64-NEXT: shrq $4, %r13 @@ -1158,9 +1157,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %r15 ; X64-NEXT: orq %r13, %r15 ; X64-NEXT: movq %r15, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %r15 -; X64-NEXT: andq %r10, %r15 +; X64-NEXT: andq %rax, %r15 ; X64-NEXT: leaq (%r15,%r13,4), %r15 ; X64-NEXT: movq %r15, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1176,9 +1175,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %r12 ; X64-NEXT: orq %r13, %r12 ; X64-NEXT: movq %r12, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %r12 -; X64-NEXT: andq %r10, %r12 +; X64-NEXT: andq %rax, %r12 ; X64-NEXT: leaq (%r12,%r13,4), %r12 ; X64-NEXT: movq %r12, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1194,9 +1193,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %r9 ; X64-NEXT: orq %r13, %r9 ; X64-NEXT: movq %r9, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %r9 -; X64-NEXT: andq %r10, %r9 +; X64-NEXT: andq %rax, %r9 ; X64-NEXT: leaq (%r9,%r13,4), %r9 ; X64-NEXT: movq %r9, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1212,9 +1211,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %r8 ; X64-NEXT: orq %r13, %r8 ; X64-NEXT: movq %r8, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %r8 -; X64-NEXT: andq %r10, %r8 +; X64-NEXT: andq %rax, %r8 ; X64-NEXT: leaq (%r8,%r13,4), %r8 ; X64-NEXT: movq %r8, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1230,9 +1229,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %rcx ; X64-NEXT: orq %r13, %rcx ; X64-NEXT: movq %rcx, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %rcx -; X64-NEXT: andq %r10, %rcx +; X64-NEXT: andq %rax, %rcx ; X64-NEXT: leaq (%rcx,%r13,4), %rcx ; X64-NEXT: movq %rcx, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1248,9 +1247,9 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %rdx ; X64-NEXT: orq %r13, %rdx ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: andq %r10, %r13 +; X64-NEXT: andq %rax, %r13 ; X64-NEXT: shrq $2, %rdx -; X64-NEXT: andq %r10, %rdx +; X64-NEXT: andq %rax, %rdx ; 
X64-NEXT: leaq (%rdx,%r13,4), %rdx ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: andq %r14, %r13 @@ -1266,26 +1265,27 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: shlq $4, %rsi ; X64-NEXT: orq %r13, %rsi ; X64-NEXT: movq %rsi, %r11 -; X64-NEXT: andq %r10, %r11 +; X64-NEXT: andq %rax, %r11 ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: andq %r10, %rsi -; X64-NEXT: leaq (%rsi,%r11,4), %rsi -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: andq %r14, %r10 -; X64-NEXT: shrq %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: leaq (%rsi,%r11,4), %rax +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: andq %r14, %rsi -; X64-NEXT: leaq (%rsi,%r10,2), %rsi +; X64-NEXT: shrq %rax +; X64-NEXT: andq %r14, %rax +; X64-NEXT: leaq (%rax,%rsi,2), %rsi ; X64-NEXT: shrdq $48, %rsi, %rdx +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shrq $48, %rsi -; X64-NEXT: movq %rdx, 56(%rax) -; X64-NEXT: movq %rcx, 48(%rax) -; X64-NEXT: movq %r8, 40(%rax) -; X64-NEXT: movq %r9, 32(%rax) -; X64-NEXT: movq %r12, 24(%rax) -; X64-NEXT: movq %r15, 16(%rax) -; X64-NEXT: movq %rbx, 8(%rax) -; X64-NEXT: movq %rdi, (%rax) -; X64-NEXT: movw %si, 64(%rax) +; X64-NEXT: movq %rdx, 56(%rdi) +; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: movq %r8, 40(%rdi) +; X64-NEXT: movq %r9, 32(%rdi) +; X64-NEXT: movq %r12, 24(%rdi) +; X64-NEXT: movq %r15, 16(%rdi) +; X64-NEXT: movq %rbx, 8(%rdi) +; X64-NEXT: movq %r10, (%rdi) +; X64-NEXT: movw %si, 64(%rdi) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -1411,10 +1411,10 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86GFNI-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 ; X86GFNI-NEXT: vmovd %xmm1, %eax -; X86GFNI-NEXT: bswapl %eax ; X86GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X86GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 ; X86GFNI-NEXT: vpextrd $1, %xmm1, %ecx +; X86GFNI-NEXT: bswapl %eax ; X86GFNI-NEXT: bswapl %ecx ; X86GFNI-NEXT: shrdl $16, %ecx, %eax ; X86GFNI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1528,23 +1528,22 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64GFNI: # %bb.0: ; X64GFNI-NEXT: pushq %r14 ; X64GFNI-NEXT: pushq %rbx -; X64GFNI-NEXT: movq %rdi, %rax ; X64GFNI-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; X64GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X64GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 -; X64GFNI-NEXT: vmovq %xmm1, %r10 -; X64GFNI-NEXT: bswapq %r10 +; X64GFNI-NEXT: vmovq %xmm1, %rax +; X64GFNI-NEXT: bswapq %rax ; X64GFNI-NEXT: vmovq %r9, %xmm1 ; X64GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 -; X64GFNI-NEXT: vmovq %xmm1, %rdi -; X64GFNI-NEXT: bswapq %rdi +; X64GFNI-NEXT: vmovq %xmm1, %r9 +; X64GFNI-NEXT: bswapq %r9 ; X64GFNI-NEXT: vmovq %r8, %xmm1 ; X64GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 ; X64GFNI-NEXT: vmovq %xmm1, %r8 ; X64GFNI-NEXT: bswapq %r8 -; X64GFNI-NEXT: movq %r8, %r9 -; X64GFNI-NEXT: shldq $16, %rdi, %r9 -; X64GFNI-NEXT: shldq $16, %r10, %rdi +; X64GFNI-NEXT: movq %r8, %r10 +; X64GFNI-NEXT: shldq $16, %r9, %r10 +; X64GFNI-NEXT: shldq $16, %rax, %r9 ; X64GFNI-NEXT: vmovq %rcx, %xmm1 ; X64GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 ; X64GFNI-NEXT: vmovq %xmm1, %rcx @@ -1574,17 +1573,18 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64GFNI-NEXT: vmovq %xmm0, %r14 ; X64GFNI-NEXT: bswapq %r14 ; X64GFNI-NEXT: shrdq $48, %r14, %rbx -; X64GFNI-NEXT: shrdq $48, %r10, %r14 +; X64GFNI-NEXT: shrdq $48, %rax, %r14 +; X64GFNI-NEXT: movq %rdi, %rax ; X64GFNI-NEXT: shrq $48, %rsi -; 
X64GFNI-NEXT: movq %r14, 16(%rax) -; X64GFNI-NEXT: movq %rbx, 8(%rax) -; X64GFNI-NEXT: movq %r11, (%rax) -; X64GFNI-NEXT: movq %rdx, 56(%rax) -; X64GFNI-NEXT: movq %rcx, 48(%rax) -; X64GFNI-NEXT: movq %r8, 40(%rax) -; X64GFNI-NEXT: movq %r9, 32(%rax) -; X64GFNI-NEXT: movq %rdi, 24(%rax) -; X64GFNI-NEXT: movw %si, 64(%rax) +; X64GFNI-NEXT: movq %r14, 16(%rdi) +; X64GFNI-NEXT: movq %rbx, 8(%rdi) +; X64GFNI-NEXT: movq %r11, (%rdi) +; X64GFNI-NEXT: movq %rdx, 56(%rdi) +; X64GFNI-NEXT: movq %rcx, 48(%rdi) +; X64GFNI-NEXT: movq %r8, 40(%rdi) +; X64GFNI-NEXT: movq %r10, 32(%rdi) +; X64GFNI-NEXT: movq %r9, 24(%rdi) +; X64GFNI-NEXT: movw %si, 64(%rdi) ; X64GFNI-NEXT: popq %rbx ; X64GFNI-NEXT: popq %r14 ; X64GFNI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll index 2922113b14ea9..ef884fea6ed96 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -102,16 +102,16 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind { ; X86-LABEL: bitselect_i64: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %esi, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %esi, %edx +; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %esi, %edx +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -143,29 +143,29 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %esi, %ecx ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edi, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %edx, %edi ; X86-NEXT: andl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index cabeebb0c3f36..8289cbb23c78a 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -390,13 +390,13 @@ declare i32 @llvm.x86.bmi.pext.32(i32, i32) define i32 @mulx32(i32 %x, i32 %y, ptr %p) { ; X86-LABEL: mulx32: 
; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl %edx, %edx ; X86-NEXT: addl %eax, %eax -; X86-NEXT: mulxl %eax, %eax, %edx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: mulxl %eax, %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: retl ; ; X64-LABEL: mulx32: @@ -439,12 +439,12 @@ define i32 @mulx32(i32 %x, i32 %y, ptr %p) { define i32 @mulx32_load(i32 %x, ptr %y, ptr %p) { ; X86-LABEL: mulx32_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl %edx, %edx -; X86-NEXT: mulxl (%eax), %eax, %edx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: mulxl (%eax), %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: retl ; ; X64-LABEL: mulx32_load: diff --git a/llvm/test/CodeGen/X86/bool-simplify.ll b/llvm/test/CodeGen/X86/bool-simplify.ll index edc36fd8b1446..bcd3a543754ab 100644 --- a/llvm/test/CodeGen/X86/bool-simplify.ll +++ b/llvm/test/CodeGen/X86/bool-simplify.ll @@ -4,8 +4,8 @@ define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: ptest %xmm0, %xmm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c) diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 2e237fb5b07b7..e2acfd9158539 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -123,7 +123,7 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) { define <32 x i8> @f32xi8_i16(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i16: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -140,7 +140,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) { ; ; AVX-64-LABEL: f32xi8_i16: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddb %xmm1, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -245,8 +245,7 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) { define <32 x i8> @f32xi8_i128(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -264,8 +263,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) { ; ; AVX-64-LABEL: f32xi8_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: 
vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddb %xmm1, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -289,7 +287,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) { define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i16: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 @@ -320,7 +318,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; ; AVX-64-LABEL: f64xi8_i16: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 @@ -791,8 +789,7 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) { define <16 x i16> @f16xi16_i128(<16 x i16> %a) { ; AVX-LABEL: f16xi16_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -810,8 +807,7 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) { ; ; AVX-64-LABEL: f16xi16_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddw %xmm1, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1046,7 +1042,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1 @@ -1061,7 +1057,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; ; AVX2-LABEL: f32xi16_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1079,7 +1075,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-64-LABEL: f32xi16_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1 @@ -1094,7 +1090,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; ; AVX2-64-LABEL: f32xi16_i256: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1195,8 +1191,7 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) { define <8 x i32> @f8xi32_i128(<8 x i32> %a) { ; AVX-LABEL: f8xi32_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1214,8 +1209,7 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) { ; ; AVX-64-LABEL: f8xi32_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddd %xmm1, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1381,8 +1375,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { define <4 x i64> @f4xi64_i128(<4 x i64> %a) { ; AVX-LABEL: f4xi64_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -1400,8 +1393,7 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) { ; ; AVX-64-LABEL: f4xi64_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,1,0,1] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1] ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-64-NEXT: vpaddq %xmm1, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -1500,7 +1492,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i256: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,0,3,0] ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,1,0,2,0,3,0] ; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 @@ -1515,7 +1507,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX2-LABEL: f8xi64_i256: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1533,7 +1525,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; AVX-64-LABEL: f8xi64_i256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,3] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] ; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1 @@ -1548,7 +1540,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { ; ; AVX2-64-LABEL: f8xi64_i256: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1682,18 +1674,18 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) { ; AVX-LABEL: f16xf32_f64: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = 
[2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f16xf32_f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retl ; @@ -1707,18 +1699,18 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) { ; AVX-64-LABEL: f16xf32_f64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f16xf32_f64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: retq ; @@ -1739,9 +1731,9 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: retl ; @@ -1749,9 +1741,9 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retl ; @@ -1767,9 +1759,9 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) { ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-64-NEXT: retq ; @@ -1777,9 +1769,9 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) { ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: retq ; @@ -1800,18 +1792,18 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) { ; AVX-LABEL: f16xf32_f256: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] -; 
AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f16xf32_f256: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] -; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retl ; @@ -1826,18 +1818,18 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) { ; AVX-64-LABEL: f16xf32_f256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] -; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f16xf32_f256: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] -; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: retq ; @@ -1897,9 +1889,9 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: retl ; @@ -1907,9 +1899,9 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retl ; @@ -1925,9 +1917,9 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX-64: # %bb.0: ; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX-64-NEXT: retq ; @@ -1935,9 +1927,9 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: retq ; @@ -1958,18 +1950,18 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) { ; AVX-LABEL: f8xf64_f256: ; AVX: # %bb.0: ; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX-NEXT: retl ; ; 
AVX2-LABEL: f8xf64_f256: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retl ; @@ -1984,18 +1976,18 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) { ; AVX-64-LABEL: f8xf64_f256: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f8xf64_f256: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll index 6d5e995a6d574..3c07977c7c70f 100644 --- a/llvm/test/CodeGen/X86/bswap-wide-int.ll +++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll @@ -71,8 +71,8 @@ define i128 @bswap_i128(i128 %a0) nounwind { ; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-MOVBE-NEXT: movbel %esi, 12(%eax) ; X86-MOVBE-NEXT: movbel %edi, 8(%eax) -; X86-MOVBE-NEXT: movbel %ecx, 4(%eax) -; X86-MOVBE-NEXT: movbel %edx, (%eax) +; X86-MOVBE-NEXT: movbel %edx, 4(%eax) +; X86-MOVBE-NEXT: movbel %ecx, (%eax) ; X86-MOVBE-NEXT: popl %esi ; X86-MOVBE-NEXT: popl %edi ; X86-MOVBE-NEXT: retl $4 @@ -149,10 +149,10 @@ define i256 @bswap_i256(i256 %a0) nounwind { ; ; X64-LABEL: bswap_i256: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: bswapq %r8 ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: bswapq %rsi ; X64-NEXT: movq %rsi, 24(%rdi) ; X64-NEXT: movq %rdx, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll index 81eac5676bb5c..542c97e6ba8d2 100644 --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -351,21 +351,21 @@ define i528 @large_promotion(i528 %A) nounwind { ; ; CHECK64-LABEL: large_promotion: ; CHECK64: # %bb.0: +; CHECK64-NEXT: pushq %r14 ; CHECK64-NEXT: pushq %rbx -; CHECK64-NEXT: movq %rdi, %rax +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK64-NEXT: bswapq %rdi ; CHECK64-NEXT: bswapq %r10 -; CHECK64-NEXT: shrdq $48, %r10, %rdi ; CHECK64-NEXT: bswapq %r11 ; CHECK64-NEXT: shrdq $48, %r11, %r10 ; CHECK64-NEXT: bswapq %rbx ; CHECK64-NEXT: shrdq $48, %rbx, %r11 +; CHECK64-NEXT: bswapq %r14 +; CHECK64-NEXT: shrdq $48, %r14, %rbx ; CHECK64-NEXT: bswapq %r9 -; CHECK64-NEXT: shrdq $48, %r9, %rbx +; CHECK64-NEXT: shrdq $48, %r9, %r14 ; CHECK64-NEXT: bswapq %r8 ; CHECK64-NEXT: shrdq $48, %r8, %r9 ; CHECK64-NEXT: bswapq %rcx @@ -374,17 +374,19 @@ define i528 @large_promotion(i528 %A) nounwind { ; CHECK64-NEXT: shrdq $48, %rdx, %rcx ; CHECK64-NEXT: bswapq %rsi ; CHECK64-NEXT: shrdq $48, %rsi, %rdx +; CHECK64-NEXT: movq %rdi, %rax ; CHECK64-NEXT: shrq $48, %rsi -; CHECK64-NEXT: movq %rdx, 56(%rax) -; CHECK64-NEXT: movq %rcx, 48(%rax) -; CHECK64-NEXT: 
movq %r8, 40(%rax) -; CHECK64-NEXT: movq %r9, 32(%rax) -; CHECK64-NEXT: movq %rbx, 24(%rax) -; CHECK64-NEXT: movq %r11, 16(%rax) -; CHECK64-NEXT: movq %r10, 8(%rax) -; CHECK64-NEXT: movq %rdi, (%rax) -; CHECK64-NEXT: movw %si, 64(%rax) +; CHECK64-NEXT: movq %rdx, 56(%rdi) +; CHECK64-NEXT: movq %rcx, 48(%rdi) +; CHECK64-NEXT: movq %r8, 40(%rdi) +; CHECK64-NEXT: movq %r9, 32(%rdi) +; CHECK64-NEXT: movq %r14, 24(%rdi) +; CHECK64-NEXT: movq %rbx, 16(%rdi) +; CHECK64-NEXT: movq %r11, 8(%rdi) +; CHECK64-NEXT: movq %r10, (%rdi) +; CHECK64-NEXT: movw %si, 64(%rdi) ; CHECK64-NEXT: popq %rbx +; CHECK64-NEXT: popq %r14 ; CHECK64-NEXT: retq %Z = call i528 @llvm.bswap.i528(i528 %A) ret i528 %Z diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll index efd9d1105d975..24b0788d7eb34 100644 --- a/llvm/test/CodeGen/X86/btc_bts_btr.ll +++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll @@ -466,11 +466,11 @@ define i16 @bts_16_load(ptr %x, i16 %n) { ; ; X86-LABEL: bts_16_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: orw (%edx), %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orw (%ecx), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %1 = load i16, ptr %x @@ -492,11 +492,11 @@ define i16 @btc_16_load(ptr %x, i16 %n) { ; ; X86-LABEL: btc_16_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: xorw (%edx), %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorw (%ecx), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %1 = load i16, ptr %x @@ -578,11 +578,11 @@ define i64 @btr_64_load(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB24_2 @@ -616,11 +616,11 @@ define i64 @bts_64_load(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB25_2 @@ -651,11 +651,11 @@ define i64 @btc_64_load(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB26_2 @@ -690,11 +690,11 @@ define void @btr_16_dont_fold(ptr %x, i16 %n) { ; ; X86-LABEL: btr_16_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movw $-2, %dx -; X86-NEXT: rolw %cl, %dx -; X86-NEXT: andw %dx, (%eax) +; X86-NEXT: movw $-2, %ax +; X86-NEXT: rolw %cl, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andw %ax, (%ecx) ; X86-NEXT: retl %1 = load i16, ptr 
%x %2 = shl i16 1, %n @@ -716,11 +716,11 @@ define void @bts_16_dont_fold(ptr %x, i16 %n) { ; ; X86-LABEL: bts_16_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orw %dx, (%eax) +; X86-NEXT: movl $1, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orw %ax, (%ecx) ; X86-NEXT: retl %1 = load i16, ptr %x %2 = shl i16 1, %n @@ -741,11 +741,11 @@ define void @btc_16_dont_fold(ptr %x, i16 %n) { ; ; X86-LABEL: btc_16_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: xorw %dx, (%eax) +; X86-NEXT: movl $1, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorw %ax, (%ecx) ; X86-NEXT: retl %1 = load i16, ptr %x %2 = shl i16 1, %n @@ -766,11 +766,11 @@ define void @btr_32_dont_fold(ptr %x, i32 %n) { ; ; X86-LABEL: btr_32_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $-2, %edx -; X86-NEXT: roll %cl, %edx -; X86-NEXT: andl %edx, (%eax) +; X86-NEXT: movl $-2, %eax +; X86-NEXT: roll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl %eax, (%ecx) ; X86-NEXT: retl %1 = load i32, ptr %x %2 = shl i32 1, %n @@ -792,11 +792,11 @@ define void @bts_32_dont_fold(ptr %x, i32 %n) { ; ; X86-LABEL: bts_32_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %edx, (%eax) +; X86-NEXT: movl $1, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, (%ecx) ; X86-NEXT: retl %1 = load i32, ptr %x %2 = shl i32 1, %n @@ -817,11 +817,11 @@ define void @btc_32_dont_fold(ptr %x, i32 %n) { ; ; X86-LABEL: btc_32_dont_fold: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: xorl %edx, (%eax) +; X86-NEXT: movl $1, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, (%ecx) ; X86-NEXT: retl %1 = load i32, ptr %x %2 = shl i32 1, %n @@ -845,22 +845,22 @@ define void @btr_64_dont_fold(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB33_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB33_2: -; X86-NEXT: notl %esi +; X86-NEXT: notl %eax +; X86-NEXT: andl %eax, (%esi) ; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, (%eax) -; X86-NEXT: andl %esi, 4(%eax) +; X86-NEXT: andl %edx, 4(%esi) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -887,20 +887,20 @@ define void @bts_64_dont_fold(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB34_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %edx, (%eax) -; X86-NEXT: orl %esi, 4(%eax) +; X86-NEXT: orl %eax, (%esi) +; X86-NEXT: orl %edx, 4(%esi) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -926,20 +926,20 @@ define void @btc_64_dont_fold(ptr %x, i64 %n) { ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB35_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %edx, (%eax) -; X86-NEXT: xorl %esi, 4(%eax) +; X86-NEXT: xorl %eax, (%esi) +; X86-NEXT: xorl %edx, 4(%esi) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll index 789196c5e4848..41869507c0f77 100644 --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -44,16 +44,16 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, fl ; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX-64-NEXT: retq @@ -124,16 +124,16 @@ define <16 x i32> @test_buildvector_v16i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i ; AVX-64-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), 
%xmm1, %xmm1 -; AVX-64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX-64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX-64-NEXT: retq %ins0 = insertelement <16 x i32> undef, i32 %a0, i32 0 @@ -174,15 +174,6 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i ; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 @@ -191,7 +182,16 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i ; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX-32-NEXT: retl ; @@ -213,24 +213,24 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i ; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-64-NEXT: vmovd %edi, %xmm1 -; AVX-64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vmovd %edi, %xmm2 +; AVX-64-NEXT: vpinsrw $1, %esi, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX-64-NEXT: retq %ins0 = insertelement <32 x i16> undef, i16 %a0, i32 0 @@ -303,23 +303,6 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 @@ -336,7 +319,24 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; 
AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX-32-NEXT: retl ; @@ -374,29 +374,12 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-64-NEXT: vmovd %edi, %xmm1 -; AVX-64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vmovd %edi, %xmm2 +; AVX-64-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 @@ -407,7 +390,24 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $6, 
{{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX-64-NEXT: retq %ins0 = insertelement <64 x i8> undef, i8 %a0, i32 0 @@ -575,7 +575,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) { ; AVX-32-LABEL: test_buildvector_16f32_2_var: ; AVX-32: # %bb.0: ; AVX-32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0] +; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = [0,17,0,0] ; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; AVX-32-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] @@ -589,12 +589,12 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) { ; AVX-64-LABEL: test_buildvector_16f32_2_var: ; AVX-64: # %bb.0: ; AVX-64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX-64-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,16,0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm2 = [0,16,0,0] ; AVX-64-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX-64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; AVX-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[0] ; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX-64-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,16,0,0,0,17,18,19] +; AVX-64-NEXT: vmovaps {{.*#+}} ymm3 = [0,16,0,0,0,17,18,19] ; AVX-64-NEXT: vpermi2ps %zmm0, %zmm1, %zmm3 ; AVX-64-NEXT: vinsertf64x4 $1, %ymm3, %zmm2, %zmm0 ; AVX-64-NEXT: retq @@ -622,7 +622,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) { ; AVX-32: # %bb.0: ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,17,0,0] +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = [0,17,0,0] ; AVX-32-NEXT: vbroadcastss (%ecx), %xmm1 ; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 @@ -636,7 +636,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) { ; ; AVX-64-LABEL: test_buildvector_16f32_2_load: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,17,0,0] +; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = [0,17,0,0] ; AVX-64-NEXT: vbroadcastss (%rdi), %xmm1 ; AVX-64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-64-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 diff --git a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll index db27132d4055b..14c655d9f346a 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll @@ -26,8 +26,10 @@ define i32 @foo(i32 %arg, ptr %arg3) nounwind { ; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: movslq %edi, %rbp ; CHECK-NEXT: leaq (,%rbp,8), %rax -; CHECK-NEXT: leaq global(%rax,%rax,2), %r14 -; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15 +; CHECK-NEXT: leaq (%rax,%rax,2), %r14 +; CHECK-NEXT: addq $global, %r14 +; CHECK-NEXT: leaq (%rax,%rax,2), %r15 +; CHECK-NEXT: 
addq $global+4, %r15 ; CHECK-NEXT: xorl %r13d, %r13d ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %bb8 @@ -43,7 +45,8 @@ define i32 @foo(i32 %arg, ptr %arg3) nounwind { ; CHECK-NEXT: testb %r13b, %r13b ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.3: # %bb15 -; CHECK-NEXT: leaq (%rbp,%rbp,2), %rax +; CHECK-NEXT: leaq (,%rbp,2), %rax +; CHECK-NEXT: addq %rbp, %rax ; CHECK-NEXT: movq %r12, global+16(,%rax,8) ; CHECK-NEXT: movabsq $-2305847407260205056, %r14 # imm = 0xDFFFFC0000000000 ; CHECK-NEXT: #APP diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll index 67213b38277dc..03baffd55a3e7 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -580,7 +580,6 @@ define void @canonicalize_undef(double addrspace(1)* %out) { define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { ; X87-LABEL: canon_fp32_varargsv4f32: ; X87: # %bb.0: -; X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X87-NEXT: fld1 ; X87-NEXT: fld %st(0) ; X87-NEXT: fmuls {{[0-9]+}}(%esp) @@ -588,6 +587,7 @@ define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { ; X87-NEXT: fmuls {{[0-9]+}}(%esp) ; X87-NEXT: fld %st(2) ; X87-NEXT: fmuls {{[0-9]+}}(%esp) +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X87-NEXT: fxch %st(3) ; X87-NEXT: fmuls {{[0-9]+}}(%esp) ; X87-NEXT: fstps 12(%eax) @@ -636,7 +636,6 @@ define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { ; X87-LABEL: canon_fp64_varargsv4f64: ; X87: # %bb.0: -; X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X87-NEXT: fld1 ; X87-NEXT: fld %st(0) ; X87-NEXT: fmull {{[0-9]+}}(%esp) @@ -644,6 +643,7 @@ define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { ; X87-NEXT: fmull {{[0-9]+}}(%esp) ; X87-NEXT: fld %st(2) ; X87-NEXT: fmull {{[0-9]+}}(%esp) +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax ; X87-NEXT: fxch %st(3) ; X87-NEXT: fmull {{[0-9]+}}(%esp) ; X87-NEXT: fstpl 24(%eax) diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll index 79513b205933e..6c7adb7aa66ca 100644 --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -147,10 +147,10 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4 ; ; AVX-LABEL: fpext: ; AVX: # %bb.0: +; AVX-NEXT: vcvtps2pd %xmm2, %ymm2 +; AVX-NEXT: vcvtps2pd %xmm3, %ymm3 ; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vcvtps2pd %xmm2, %ymm1 -; AVX-NEXT: vcvtps2pd %xmm3, %ymm2 -; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 +; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 ; AVX-NEXT: retq %cmp = fcmp olt <4 x double> %a, %b %sel = select <4 x i1> %cmp, <4 x float> %c, <4 x float> %d @@ -194,7 +194,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; AVX1-LABEL: trunc: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 @@ -228,13 +228,13 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4 ; SSE2-NEXT: cmpltps %xmm1, %xmm0 ; SSE2-NEXT: cvtpd2ps %xmm5, %xmm1 ; SSE2-NEXT: cvtpd2ps %xmm4, %xmm4 +; SSE2-NEXT: cvtpd2ps %xmm3, %xmm3 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE2-NEXT: cvtpd2ps %xmm3, %xmm1 
-; SSE2-NEXT: cvtpd2ps %xmm2, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: cvtpd2ps %xmm2, %xmm1 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: andpd %xmm0, %xmm1 ; SSE2-NEXT: andnpd %xmm4, %xmm0 -; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: fptrunc: @@ -242,8 +242,8 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4 ; SSE41-NEXT: cmpltps %xmm1, %xmm0 ; SSE41-NEXT: cvtpd2ps %xmm3, %xmm1 ; SSE41-NEXT: cvtpd2ps %xmm2, %xmm2 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE41-NEXT: cvtpd2ps %xmm5, %xmm3 +; SSE41-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE41-NEXT: cvtpd2ps %xmm4, %xmm1 ; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 @@ -252,10 +252,10 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4 ; ; AVX-LABEL: fptrunc: ; AVX: # %bb.0: +; AVX-NEXT: vcvtpd2ps %ymm2, %xmm2 +; AVX-NEXT: vcvtpd2ps %ymm3, %xmm3 ; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vcvtpd2ps %ymm2, %xmm1 -; AVX-NEXT: vcvtpd2ps %ymm3, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %cmp = fcmp olt <4 x float> %a, %b @@ -281,16 +281,16 @@ define dso_local void @example25() nounwind { ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB5_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movaps da+4112(%rax), %xmm0 -; SSE2-NEXT: movaps da+4096(%rax), %xmm1 -; SSE2-NEXT: cmpltps db+4096(%rax), %xmm1 -; SSE2-NEXT: cmpltps db+4112(%rax), %xmm0 +; SSE2-NEXT: movaps da+4096(%rax), %xmm0 +; SSE2-NEXT: cmpltps db+4096(%rax), %xmm0 +; SSE2-NEXT: movaps da+4112(%rax), %xmm1 +; SSE2-NEXT: cmpltps db+4112(%rax), %xmm1 ; SSE2-NEXT: movaps dc+4112(%rax), %xmm2 ; SSE2-NEXT: movaps dc+4096(%rax), %xmm3 ; SSE2-NEXT: cmpltps dd+4096(%rax), %xmm3 -; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm0, %xmm3 ; SSE2-NEXT: cmpltps dd+4112(%rax), %xmm2 -; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: andps %xmm1, %xmm2 ; SSE2-NEXT: psrld $31, %xmm3 ; SSE2-NEXT: psrld $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax) @@ -306,16 +306,16 @@ define dso_local void @example25() nounwind { ; SSE41-NEXT: .p2align 4 ; SSE41-NEXT: .LBB5_1: # %vector.body ; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE41-NEXT: movaps da+4112(%rax), %xmm0 -; SSE41-NEXT: movaps da+4096(%rax), %xmm1 -; SSE41-NEXT: cmpltps db+4096(%rax), %xmm1 -; SSE41-NEXT: cmpltps db+4112(%rax), %xmm0 +; SSE41-NEXT: movaps da+4096(%rax), %xmm0 +; SSE41-NEXT: cmpltps db+4096(%rax), %xmm0 +; SSE41-NEXT: movaps da+4112(%rax), %xmm1 +; SSE41-NEXT: cmpltps db+4112(%rax), %xmm1 ; SSE41-NEXT: movaps dc+4112(%rax), %xmm2 ; SSE41-NEXT: movaps dc+4096(%rax), %xmm3 ; SSE41-NEXT: cmpltps dd+4096(%rax), %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm0, %xmm3 ; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm2 -; SSE41-NEXT: andps %xmm0, %xmm2 +; SSE41-NEXT: andps %xmm1, %xmm2 ; SSE41-NEXT: psrld $31, %xmm3 ; SSE41-NEXT: psrld $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, dj+4112(%rax) @@ -328,7 +328,7 @@ define dso_local void @example25() nounwind { ; AVX1-LABEL: example25: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: .p2align 4 ; 
AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/clear-highbits.ll b/llvm/test/CodeGen/X86/clear-highbits.ll index 755b1094234fd..938559ca25e3a 100644 --- a/llvm/test/CodeGen/X86/clear-highbits.ll +++ b/llvm/test/CodeGen/X86/clear-highbits.ll @@ -525,9 +525,9 @@ define i64 @clear_highbits64_c0(i64 %val, i64 %numhighbits) nounwind { ; X86-BASELINE: # %bb.0: ; X86-BASELINE-NEXT: pushl %esi ; X86-BASELINE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: movl $-1, %esi ; X86-BASELINE-NEXT: shrl %cl, %esi +; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: xorl %edx, %edx ; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB13_1 @@ -598,9 +598,9 @@ define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind { ; X86-BASELINE: # %bb.0: ; X86-BASELINE-NEXT: pushl %esi ; X86-BASELINE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: movl $-1, %esi ; X86-BASELINE-NEXT: shrl %cl, %esi +; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: xorl %edx, %edx ; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB14_1 @@ -672,11 +672,11 @@ define i64 @clear_highbits64_c2_load(ptr %w, i64 %numhighbits) nounwind { ; X86-BASELINE: # %bb.0: ; X86-BASELINE-NEXT: pushl %edi ; X86-BASELINE-NEXT: pushl %esi -; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BASELINE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: movl $-1, %edi ; X86-BASELINE-NEXT: shrl %cl, %edi +; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: xorl %edx, %edx ; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB15_1 @@ -755,11 +755,11 @@ define i64 @clear_highbits64_c3_load_indexzext(ptr %w, i8 %numhighbits) nounwind ; X86-BASELINE: # %bb.0: ; X86-BASELINE-NEXT: pushl %edi ; X86-BASELINE-NEXT: pushl %esi -; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BASELINE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: movl $-1, %edi ; X86-BASELINE-NEXT: shrl %cl, %edi +; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: xorl %edx, %edx ; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB16_1 @@ -839,9 +839,9 @@ define i64 @clear_highbits64_c4_commutative(i64 %val, i64 %numhighbits) nounwind ; X86-BASELINE: # %bb.0: ; X86-BASELINE-NEXT: pushl %esi ; X86-BASELINE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: movl $-1, %esi ; X86-BASELINE-NEXT: shrl %cl, %esi +; X86-BASELINE-NEXT: movl $-1, %eax ; X86-BASELINE-NEXT: xorl %edx, %edx ; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB17_1 @@ -981,17 +981,17 @@ define i64 @oneuse64_c(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl $-1, %esi +; X86-BMI1-NEXT: shrl %cl, %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl $-1, %eax -; X86-BMI1-NEXT: movl $-1, %edi -; X86-BMI1-NEXT: shrl %cl, %edi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: cmovnel %edi, %eax -; X86-BMI1-NEXT: cmovel %edi, %edx -; X86-BMI1-NEXT: movl %edx, 4(%esi) -; X86-BMI1-NEXT: movl %eax, 
(%esi) +; X86-BMI1-NEXT: cmovnel %esi, %eax +; X86-BMI1-NEXT: cmovel %esi, %edx +; X86-BMI1-NEXT: movl %edx, 4(%edi) +; X86-BMI1-NEXT: movl %eax, (%edi) ; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: popl %esi @@ -1044,21 +1044,21 @@ define i64 @oneuse64_c(i64 %val, i64 %numhighbits, ptr %escape) nounwind { define i32 @oneuse32_d(i32 %val, i32 %numhighbits, ptr %escape) nounwind { ; X86-NOBMI2-LABEL: oneuse32_d: ; X86-NOBMI2: # %bb.0: -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI2-NEXT: shll %cl, %eax +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI2-NEXT: movl %eax, (%edx) ; X86-NOBMI2-NEXT: shrl %cl, %eax ; X86-NOBMI2-NEXT: retl ; ; X86-BMI2-LABEL: oneuse32_d: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: shlxl %ecx, {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: movl %edx, (%eax) -; X86-BMI2-NEXT: shrxl %ecx, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlxl %eax, {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl %ecx, (%edx) +; X86-BMI2-NEXT: shrxl %eax, %ecx, %eax ; X86-BMI2-NEXT: retl ; ; X64-NOBMI2-LABEL: oneuse32_d: @@ -1086,6 +1086,7 @@ define i32 @oneuse32_d(i32 %val, i32 %numhighbits, ptr %escape) nounwind { define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BASELINE-LABEL: oneusei64_d: ; X86-BASELINE: # %bb.0: +; X86-BASELINE-NEXT: pushl %ebp ; X86-BASELINE-NEXT: pushl %ebx ; X86-BASELINE-NEXT: pushl %edi ; X86-BASELINE-NEXT: pushl %esi @@ -1095,8 +1096,8 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BASELINE-NEXT: movl %edx, %edi ; X86-BASELINE-NEXT: shll %cl, %edi ; X86-BASELINE-NEXT: shldl %cl, %edx, %eax -; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: movl %edi, %esi +; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB21_2 ; X86-BASELINE-NEXT: # %bb.1: ; X86-BASELINE-NEXT: movl %eax, %esi @@ -1104,8 +1105,8 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BASELINE-NEXT: movl %esi, %eax ; X86-BASELINE-NEXT: shrl %cl, %eax ; X86-BASELINE-NEXT: xorl %ebx, %ebx -; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: movl $0, %edx +; X86-BASELINE-NEXT: testb $32, %cl ; X86-BASELINE-NEXT: jne .LBB21_4 ; X86-BASELINE-NEXT: # %bb.3: ; X86-BASELINE-NEXT: movl %edi, %ebx @@ -1113,10 +1114,10 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BASELINE-NEXT: .LBB21_4: ; X86-BASELINE-NEXT: movl %ebx, %edi ; X86-BASELINE-NEXT: shrdl %cl, %esi, %edi +; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BASELINE-NEXT: movl %ebx, (%ebp) +; X86-BASELINE-NEXT: movl %esi, 4(%ebp) ; X86-BASELINE-NEXT: testb $32, %cl -; X86-BASELINE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BASELINE-NEXT: movl %ebx, (%ecx) -; X86-BASELINE-NEXT: movl %esi, 4(%ecx) ; X86-BASELINE-NEXT: jne .LBB21_6 ; X86-BASELINE-NEXT: # %bb.5: ; X86-BASELINE-NEXT: movl %edi, %eax @@ -1124,40 +1125,45 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BASELINE-NEXT: popl %esi ; X86-BASELINE-NEXT: popl %edi ; X86-BASELINE-NEXT: popl %ebx +; X86-BASELINE-NEXT: popl %ebp ; X86-BASELINE-NEXT: retl ; ; X86-BMI1-LABEL: oneusei64_d: ; X86-BMI1: # %bb.0: +; X86-BMI1-NEXT: pushl %ebp ; X86-BMI1-NEXT: pushl %ebx 
; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl %edx, %eax -; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: shldl %cl, %edx, %esi +; X86-BMI1-NEXT: movl %eax, %edi +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: shldl %cl, %eax, %esi ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: cmovnel %eax, %esi -; X86-BMI1-NEXT: movl %esi, %edi -; X86-BMI1-NEXT: shrl %cl, %edi +; X86-BMI1-NEXT: cmovnel %edi, %esi +; X86-BMI1-NEXT: movl %esi, %ebx +; X86-BMI1-NEXT: shrl %cl, %ebx ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: cmovnel %edx, %eax -; X86-BMI1-NEXT: cmovel %edi, %edx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-BMI1-NEXT: movl %eax, (%ebx) +; X86-BMI1-NEXT: cmovnel %edx, %edi +; X86-BMI1-NEXT: cmovel %ebx, %edx +; X86-BMI1-NEXT: movl %edi, %eax ; X86-BMI1-NEXT: shrdl %cl, %esi, %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI1-NEXT: movl %edi, (%ebp) ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: movl %esi, 4(%ebx) -; X86-BMI1-NEXT: cmovnel %edi, %eax +; X86-BMI1-NEXT: movl %esi, 4(%ebp) +; X86-BMI1-NEXT: cmovnel %ebx, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: popl %ebx +; X86-BMI1-NEXT: popl %ebp ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: oneusei64_d: ; X86-BMI2: # %bb.0: +; X86-BMI2-NEXT: pushl %ebp ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi @@ -1165,22 +1171,24 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind { ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: shldl %cl, %eax, %esi -; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: shlxl %ecx, %eax, %edi ; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl -; X86-BMI2-NEXT: cmovnel %eax, %esi -; X86-BMI2-NEXT: cmovnel %edx, %eax -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edi -; X86-BMI2-NEXT: cmovel %edi, %edx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-BMI2-NEXT: movl %eax, (%ebx) +; X86-BMI2-NEXT: cmovnel %edi, %esi +; X86-BMI2-NEXT: cmovnel %edx, %edi +; X86-BMI2-NEXT: shrxl %ecx, %esi, %ebx +; X86-BMI2-NEXT: cmovel %ebx, %edx +; X86-BMI2-NEXT: movl %edi, %eax ; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI2-NEXT: movl %edi, (%ebp) ; X86-BMI2-NEXT: testb $32, %cl -; X86-BMI2-NEXT: movl %esi, 4(%ebx) -; X86-BMI2-NEXT: cmovnel %edi, %eax +; X86-BMI2-NEXT: movl %esi, 4(%ebp) +; X86-BMI2-NEXT: cmovnel %ebx, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi ; X86-BMI2-NEXT: popl %ebx +; X86-BMI2-NEXT: popl %ebp ; X86-BMI2-NEXT: retl ; ; X64-NOBMI2-LABEL: oneusei64_d: diff --git a/llvm/test/CodeGen/X86/clear-lowbits.ll b/llvm/test/CodeGen/X86/clear-lowbits.ll index 49ea2d0f1ed7a..1f03c58712518 100644 --- a/llvm/test/CodeGen/X86/clear-lowbits.ll +++ b/llvm/test/CodeGen/X86/clear-lowbits.ll @@ -865,11 +865,11 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind { ; ; X86-BMI2-LABEL: clear_lowbits16_ic0: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movb $16, %cl -; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax -; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: movb $16, %al +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %al +; 
X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %ecx, %ecx +; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-BMI2-NEXT: retl ; @@ -911,11 +911,11 @@ define i16 @clear_lowbits16_ic1_indexzext(i16 %val, i8 %numlowbits) nounwind { ; ; X86-BMI2-LABEL: clear_lowbits16_ic1_indexzext: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movb $16, %cl -; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax -; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: movb $16, %al +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %al +; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %ecx, %ecx +; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-BMI2-NEXT: retl ; @@ -960,9 +960,9 @@ define i16 @clear_lowbits16_ic2_load(ptr %w, i16 %numlowbits) nounwind { ; X86-BMI2-LABEL: clear_lowbits16_ic2_load: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movzwl (%eax), %eax ; X86-BMI2-NEXT: movb $16, %cl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: movzwl (%eax), %eax ; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -1009,9 +1009,9 @@ define i16 @clear_lowbits16_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind ; X86-BMI2-LABEL: clear_lowbits16_ic3_load_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movzwl (%eax), %eax ; X86-BMI2-NEXT: movb $16, %cl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI2-NEXT: movzwl (%eax), %eax ; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax @@ -1057,11 +1057,11 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind ; ; X86-BMI2-LABEL: clear_lowbits16_ic4_commutative: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movb $16, %cl -; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-BMI2-NEXT: shrxl %ecx, %eax, %eax -; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: movb $16, %al +; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %al +; X86-BMI2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %ecx, %ecx +; X86-BMI2-NEXT: shlxl %eax, %ecx, %eax ; X86-BMI2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-BMI2-NEXT: retl ; @@ -1425,9 +1425,9 @@ define i64 @clear_lowbits64_ic2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-NOBMI2-LABEL: clear_lowbits64_ic2_load: ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: pushl %esi -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movb $64, %cl ; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1445,9 +1445,9 @@ define i64 @clear_lowbits64_ic2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-BMI2-LABEL: clear_lowbits64_ic2_load: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movb $64, %bl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %bl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $-1, %edx ; X86-BMI2-NEXT: shlxl %ebx, %edx, %eax ; X86-BMI2-NEXT: testb $32, %bl @@ -1488,9 +1488,9 @@ define i64 @clear_lowbits64_ic3_load_indexzext(ptr %w, i8 
%numlowbits) nounwind ; X86-NOBMI2-LABEL: clear_lowbits64_ic3_load_indexzext: ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: pushl %esi -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movb $64, %cl ; X86-NOBMI2-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %eax ; X86-NOBMI2-NEXT: shll %cl, %eax @@ -1508,9 +1508,9 @@ define i64 @clear_lowbits64_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind ; X86-BMI2-LABEL: clear_lowbits64_ic3_load_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movb $64, %bl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %bl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $-1, %edx ; X86-BMI2-NEXT: shlxl %ebx, %edx, %eax ; X86-BMI2-NEXT: testb $32, %bl @@ -1658,11 +1658,11 @@ define i64 @oneuse64(i64 %val, i64 %numlowbits, ptr %escape) nounwind { ; X86-NOBMI2: # %bb.0: ; X86-NOBMI2-NEXT: pushl %edi ; X86-NOBMI2-NEXT: pushl %esi -; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: movl $-1, %edi ; X86-NOBMI2-NEXT: shll %cl, %edi +; X86-NOBMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI2-NEXT: movl $-1, %edx ; X86-NOBMI2-NEXT: xorl %eax, %eax ; X86-NOBMI2-NEXT: testb $32, %cl ; X86-NOBMI2-NEXT: jne .LBB37_1 diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll index 0551152a0718d..62faaf507eedf 100644 --- a/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll +++ b/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll @@ -1,4 +1,5 @@ -; RUN: not llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs %s -o - 2>&1 | FileCheck %s declare cc 11 i64 @hipe2(i64, i64, i64, i64, i64, i64, i64) @@ -8,9 +9,52 @@ declare cc 11 i64 @hipe2(i64, i64, i64, i64, i64, i64, i64) ; argument after rbp is assigned argument for function call, it is caused ; by x86-cf-opt. -; CHECK: :0: error: Interference usage of base pointer/frame pointer. -; CHECK: :0: error: Interference usage of base pointer/frame pointer. 
define i64 @test3(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-16, %rsp +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movups 16(%rbp), %xmm0 +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x77, 0x08, 0x06, 0x11, 0x10, 0x22 # +; CHECK-NEXT: movq %rsi, %rbp +; CHECK-NEXT: movups %xmm0, (%rsp) +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %r8, %rcx +; CHECK-NEXT: movq %r9, %r8 +; CHECK-NEXT: callq hipe2@PLT +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: leaq -40(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %x = call cc 11 i64 @hipe2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) ret i64 %x } diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll index b18283dd8e8d2..5ad922c660c77 100644 --- a/llvm/test/CodeGen/X86/cmov-into-branch.ll +++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll @@ -48,8 +48,8 @@ define i32 @test4(i32 %a, ptr nocapture %b, i32 %x, i32 %y) { define i32 @test5(i32 %a, ptr nocapture %b, i32 %x, i32 %y) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: cmpl %edi, (%rsi) +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: cmoval %edi, %eax ; CHECK-NEXT: cmovael %edx, %eax ; CHECK-NEXT: retq @@ -170,8 +170,8 @@ define i32 @weighted_selects(i32 %a, i32 %b) !prof !19 { ; CHECK-LABEL: weighted_selects: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jne .LBB11_2 ; CHECK-NEXT: # %bb.1: # %select.false ; CHECK-NEXT: movl %eax, %ecx diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll index 374e75967d52f..189dbbbf4106e 100644 --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -176,7 +176,8 @@ define i32 @test6(ptr nocapture %P) nounwind readonly { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $42, (%rdi) ; CHECK-NEXT: setl %al -; CHECK-NEXT: leal 4(%rax,%rax,8), %eax +; CHECK-NEXT: leal (%rax,%rax,8), %eax +; CHECK-NEXT: addl $4, %eax ; CHECK-NEXT: retq entry: %0 = load i32, ptr %P, align 4 diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll index d2d1c4db4608d..6fa72c76598b6 100644 --- a/llvm/test/CodeGen/X86/cmovcmov.ll +++ b/llvm/test/CodeGen/X86/cmovcmov.ll @@ -325,10 +325,10 @@ define dso_local void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) nounwi ; ; NOCMOV-LABEL: no_cascade_opt: ; NOCMOV: # %bb.0: # %entry -; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; NOCMOV-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: movb $20, %al ; NOCMOV-NEXT: movb $20, %cl +; NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: jge .LBB7_1 ; NOCMOV-NEXT: # %bb.2: # %entry ; NOCMOV-NEXT: jle .LBB7_3 diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll index 5e030de1409f2..6db74dec96bd4 100644 --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -93,20 +93,20 @@ define <16 x i8> @cmp_allbits_concat_v16i8(<16 x i8> %x, <16 x i8> %y) { define <2 x i64> @cmp_nobits_concat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: cmp_nobits_concat_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: pextrq $1, %xmm0, %rcx +; CHECK-NEXT: pextrq $1, %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rcx ; CHECK-NEXT: movq %xmm1, %rdx ; CHECK-NEXT: pextrq $1, %xmm1, %rsi ; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: orq %rcx, %rsi +; CHECK-NEXT: orq %rax, %rsi ; CHECK-NEXT: sete %dil ; CHECK-NEXT: negq %rdi ; CHECK-NEXT: movq %rdi, %xmm1 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: orq %rax, %rdx -; CHECK-NEXT: sete %cl -; CHECK-NEXT: negq %rcx -; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: sete %al +; CHECK-NEXT: negq %rax +; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %zx = zext <2 x i64> %x to <2 x i128> diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll index 227de9ad0ab69..7ac1349023327 100644 --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -377,7 +377,7 @@ define <4 x i1> @shr_to_ror_eq_4xi32_s4(<4 x i32> %x) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vprold $4, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-AVX512-NEXT: retq %shr = lshr <4 x i32> %x, %and = and <4 x i32> %x, @@ -419,7 +419,7 @@ define <4 x i1> @shl_to_ror_eq_4xi32_s8(<4 x i32> %x) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vprold $8, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-AVX512-NEXT: retq %shr = shl <4 x i32> %x, %and = and <4 x i32> %x, @@ -469,10 +469,10 @@ define <4 x i1> @shl_to_ror_eq_4xi32_s7_fail_no_p2(<4 x i32> %x) { ; ; CHECK-AVX512-LABEL: shl_to_ror_eq_4xi32_s7_fail_no_p2: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpslld $7, %xmm0, %xmm1 -; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpslld $7, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-AVX512-NEXT: retq %shr = shl <4 x i32> %x, %and = and <4 x i32> %x, @@ -535,7 +535,7 @@ define <4 x i1> @shr_to_ror_eq_4xi32_s4_fail_no_splat(<4 x i32> %x) { ; CHECK-AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; 
CHECK-AVX512-NEXT: retq %shr = lshr <4 x i32> %x, %and = and <4 x i32> %x, @@ -580,8 +580,8 @@ define <16 x i1> @shl_to_ror_eq_16xi16_s8_fail_preserve_i16(<16 x i16> %x) { ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 @@ -610,7 +610,7 @@ define <16 x i1> @shl_to_ror_eq_16xi16_s8_fail_preserve_i16(<16 x i16> %x) { ; CHECK-AVX512-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; CHECK-AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq %shr = shl <16 x i16> %x, diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll index e58295fff9855..69050cede52c8 100644 --- a/llvm/test/CodeGen/X86/cmpf-avx.ll +++ b/llvm/test/CodeGen/X86/cmpf-avx.ll @@ -97,7 +97,7 @@ define <8 x i32> @cmp_sgt_fail_no_bounds(<8 x i32> %x, <8 x i32> %y) { define <8 x i32> @cmp_sgt_bitcast(<8 x i32> %xx, <8 x i32> %yy) { ; CHECK-LABEL: cmp_sgt_bitcast: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040] ; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vandps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll index e9f529eea7d3f..8e4e6ec4d8cba 100644 --- a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll +++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness-reduced.ll @@ -56,8 +56,8 @@ define void @foo(ptr %arg3, i1 %icmp16) #0 { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: callq *%rax ; CHECK-NEXT: movl %r13d, %r13d -; CHECK-NEXT: testb $1, %bl ; CHECK-NEXT: movl $0, %r14d +; CHECK-NEXT: testb $1, %bl ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.3: # %bb17 ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll index 0ca1da26fa89c..7d39c6e65ff65 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll +++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll @@ -17,8 +17,8 @@ define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind { ; NOPIC-NEXT: pushq %rbx ; NOPIC-NEXT: pushq %rax ; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %ebp -; NOPIC-NEXT: testl %edi, %edi ; NOPIC-NEXT: movl %ebp, %eax +; NOPIC-NEXT: testl %edi, %edi ; NOPIC-NEXT: jne .LBB0_2 ; NOPIC-NEXT: # %bb.1: # %if.then ; NOPIC-NEXT: movq %rsi, %rbx @@ -37,8 +37,8 @@ define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind { ; PIC-NEXT: pushq %r14 ; PIC-NEXT: pushq %rbx ; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, 
%ebp -; PIC-NEXT: testl %edi, %edi ; PIC-NEXT: movl %ebp, %eax +; PIC-NEXT: testl %edi, %edi ; PIC-NEXT: jne .LBB0_2 ; PIC-NEXT: # %bb.1: # %if.then ; PIC-NEXT: movq %rsi, %rbx @@ -59,8 +59,8 @@ define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind { ; TLSDESC-NEXT: pushq %r14 ; TLSDESC-NEXT: pushq %rbx ; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp -; TLSDESC-NEXT: testl %edi, %edi ; TLSDESC-NEXT: movl %ebp, %eax +; TLSDESC-NEXT: testl %edi, %edi ; TLSDESC-NEXT: jne .LBB0_2 ; TLSDESC-NEXT: # %bb.1: # %if.then ; TLSDESC-NEXT: movq %rsi, %rbx @@ -103,8 +103,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind { ; NOPIC-NEXT: pushq %rbx ; NOPIC-NEXT: movq foo_nonlocal@GOTTPOFF(%rip), %r14 ; NOPIC-NEXT: movl %fs:(%r14), %ebp -; NOPIC-NEXT: testl %edi, %edi ; NOPIC-NEXT: movl %ebp, %eax +; NOPIC-NEXT: testl %edi, %edi ; NOPIC-NEXT: jne .LBB1_2 ; NOPIC-NEXT: # %bb.1: # %if.then ; NOPIC-NEXT: movq %rsi, %rbx @@ -134,8 +134,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind { ; PIC-NEXT: callq __tls_get_addr@PLT ; PIC-NEXT: movq %rax, %r14 ; PIC-NEXT: movl (%rax), %r15d -; PIC-NEXT: testl %ebp, %ebp ; PIC-NEXT: movl %r15d, %eax +; PIC-NEXT: testl %ebp, %ebp ; PIC-NEXT: jne .LBB1_2 ; PIC-NEXT: # %bb.1: # %if.then ; PIC-NEXT: callq effect@PLT @@ -157,8 +157,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind { ; TLSDESC-NEXT: leaq foo_nonlocal@tlsdesc(%rip), %rax ; TLSDESC-NEXT: callq *foo_nonlocal@tlscall(%rax) ; TLSDESC-NEXT: movl %fs:(%rax), %ebp -; TLSDESC-NEXT: testl %edi, %edi ; TLSDESC-NEXT: movl %ebp, %ecx +; TLSDESC-NEXT: testl %edi, %edi ; TLSDESC-NEXT: jne .LBB1_2 ; TLSDESC-NEXT: # %bb.1: # %if.then ; TLSDESC-NEXT: movq %rsi, %rbx diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index ff9f995c4765b..c5cd876feb1b8 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -16,14 +16,14 @@ define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) { define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) { ; SSE-LABEL: combine_vec_add_constant_sub: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [0,2,4,6] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2,4,6] ; SSE-NEXT: psubd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_add_constant_sub: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,2,4,6] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> , %a @@ -230,7 +230,7 @@ define void @PR52039(ptr %pa, ptr %pb) { ; SSE: # %bb.0: ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: movdqu 16(%rdi), %xmm1 -; SSE-NEXT: pmovsxbd {{.*#+}} xmm2 = [10,10,10,10] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [10,10,10,10] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 @@ -298,9 +298,9 @@ define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) { ; AVX2-LABEL: combine_vec_add_uniquebits: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [3855,3855,3855,3855] ; AVX2-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855] -; AVX2-NEXT: vandps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = and <4 x i32> %a, diff --git a/llvm/test/CodeGen/X86/combine-addo.ll b/llvm/test/CodeGen/X86/combine-addo.ll index 
ba748b6e653cf..af51c04765224 100644 --- a/llvm/test/CodeGen/X86/combine-addo.ll +++ b/llvm/test/CodeGen/X86/combine-addo.ll @@ -77,7 +77,7 @@ define <4 x i32> @combine_vec_uadd_not(<4 x i32> %a0, <4 x i32> %a1) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE-NEXT: pmaxud %xmm2, %xmm0 ; SSE-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index e5594dc9c5e3c..e320f3b5c407d 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -644,7 +644,7 @@ define <8 x i64> @neg_scalar_broadcast_v8i64(i64 %a0, <2 x i64> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] ; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 25c26d598881a..8654d568823d1 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -34,7 +34,7 @@ define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967295,4294967294,4294967293,4294967292] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744069414584319,18446744060824649725] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -77,7 +77,7 @@ define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, ptr nocapture readonly) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967294,4294967293,4294967292,4294967295] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -122,7 +122,7 @@ define <2 x i64> @bitselect_v2i64_mr(ptr nocapture readonly, <2 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,3,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [12884901890,4294967296] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -153,7 +153,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; XOP-LABEL: bitselect_v2i64_mm: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rsi), %xmm0 -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967292,4294967295,4294967294,4294967293] +; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022] ; XOP-NEXT: vpcmov %xmm1, (%rdi), %xmm0, %xmm0 ; XOP-NEXT: retq ; @@ -170,7 +170,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 
(%rsi), %xmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4294967292,4294967295,4294967294,4294967293] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551612,18446744065119617022] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -179,7 +179,7 @@ define <2 x i64> @bitselect_v2i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512VL-LABEL: bitselect_v2i64_mm: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967295,4294967294,4294967293] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022] ; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = mem ^ (xmm0 & (xmm1 ^ mem)) ; AVX512VL-NEXT: retq %3 = load <2 x i64>, ptr %0 @@ -306,8 +306,8 @@ define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) { ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE-NEXT: orps %xmm3, %xmm1 ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: orps %xmm3, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -327,7 +327,7 @@ define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967295,4294967294,4294967293,4294967292,4294967293,4294967292,4294967293,4294967292] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -377,7 +377,7 @@ define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, ptr nocapture readonly) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967294,4294967293,4294967292,4294967295,4294967294,4294967293,4294967292,4294967295] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -429,7 +429,7 @@ define <4 x i64> @bitselect_v4i64_mr(ptr nocapture readonly, <4 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,3,0,1,2,3,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -456,16 +456,15 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; SSE-NEXT: andps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: andnps (%rdi), %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 ; SSE-NEXT: andnps 16(%rdi), %xmm1 +; SSE-NEXT: orps %xmm3, %xmm0 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: retq ; ; XOP-LABEL: bitselect_v4i64_mm: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rsi), %ymm0 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = 
[18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; XOP-NEXT: # ymm1 = mem[0,1,0,1] +; XOP-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; XOP-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0 ; XOP-NEXT: retq ; @@ -482,7 +481,7 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4294967292,4294967295,4294967294,4294967293,4294967292,4294967295,4294967294,4294967293] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -490,7 +489,7 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512VL-LABEL: bitselect_v4i64_mm: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967292,4294967295,4294967294,4294967293,4294967292,4294967295,4294967294,4294967293] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = mem ^ (ymm0 & (ymm1 ^ mem)) ; AVX512VL-NEXT: retq %3 = load <4 x i64>, ptr %0 @@ -837,21 +836,20 @@ define <8 x i64> @bitselect_v8i64_mm(ptr nocapture readonly, ptr nocapture reado ; ; AVX-LABEL: bitselect_v8i64_mm: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] -; AVX-NEXT: vandps 32(%rsi), %ymm1, %ymm2 -; AVX-NEXT: vandps (%rsi), %ymm1, %ymm0 -; AVX-NEXT: vandnps (%rdi), %ymm1, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vandnps 32(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX-NEXT: # ymm0 = mem[0,1,0,1] +; AVX-NEXT: vandps 32(%rsi), %ymm0, %ymm1 +; AVX-NEXT: vandps (%rsi), %ymm0, %ymm2 +; AVX-NEXT: vandnps (%rdi), %ymm0, %ymm3 +; AVX-NEXT: vandnps 32(%rdi), %ymm0, %ymm4 +; AVX-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_mm: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem)) ; AVX512-NEXT: retq %3 = load <8 x i64>, ptr %0 @@ -1005,27 +1003,27 @@ define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) { ; XOP-LABEL: bitselect_v4i1_loop: ; XOP: # %bb.0: # %bb ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; XOP-NEXT: vpcomeqd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; XOP-NEXT: retq ; ; AVX1-LABEL: bitselect_v4i1_loop: ; AVX1: # %bb.0: # %bb ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: bitselect_v4i1_loop: ; AVX2: # %bb.0: # %bb ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [12,12,12,12] ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12] -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15] ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 @@ -1034,8 +1032,8 @@ define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) { ; AVX512F-LABEL: bitselect_v4i1_loop: ; AVX512F: # %bb.0: # %bb ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k2 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k2} ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll index 7237b02ca6b66..52fc4b7bda7f2 100644 --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -48,7 +48,7 @@ define void @concat_of_broadcast_v2f64_v4f64() { ; AVX1-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000 ; AVX1-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001 ; AVX1-NEXT: movq %rcx, 46348(%rax) -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3] ; AVX1-NEXT: vmovups %ymm0, 48296(%rax) ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [7.812501848093234E-3,0.0E+0] ; AVX1-NEXT: vmovsd %xmm0, 47372(%rax) diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll index d7031be3addd9..e4207cdef4ad6 100644 --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -114,9 +114,9 @@ define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> % ; AVX-LABEL: combine_vec_fcopysign_fabs_mag: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) 
@@ -136,9 +136,9 @@ define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> % ; AVX-LABEL: combine_vec_fcopysign_fneg_mag: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub <4 x float> , %x @@ -158,9 +158,9 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x flo ; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %z) @@ -180,9 +180,9 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_sgn(<4 x float> %x, <4 x flo ; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %y, <4 x float> %z) @@ -211,9 +211,9 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float ; AVX: # %bb.0: ; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %1 = fpext <4 x float> %y to <4 x double> @@ -237,9 +237,9 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl ; AVX: # %bb.0: ; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-fneg.ll b/llvm/test/CodeGen/X86/combine-fneg.ll index 8ca7fb81563fa..9d92d3a572e3f 100644 --- a/llvm/test/CodeGen/X86/combine-fneg.ll +++ b/llvm/test/CodeGen/X86/combine-fneg.ll @@ -125,8 +125,8 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind { ; X86-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-SSE1-NEXT: movl 12(%ebp), %ecx ; X86-SSE1-NEXT: xorl %eax, %ecx -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: xorl 8(%ebp), %eax +; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: movaps (%esp), %xmm0 ; 
X86-SSE1-NEXT: movl %ebp, %esp @@ -138,8 +138,8 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind { ; X86-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: xorl %eax, %ecx -; X86-SSE2-NEXT: movd %ecx, %xmm1 ; X86-SSE2-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movd %ecx, %xmm1 ; X86-SSE2-NEXT: movd %eax, %xmm0 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 8e4a50ea266c3..44aee16f66407 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -134,12 +134,12 @@ define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) { define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_negpow2c: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbd {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmovsxbq {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551615,18446744073709551614] ; SSE-NEXT: pmuludq %xmm5, %xmm4 ; SSE-NEXT: paddq %xmm3, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 @@ -148,7 +148,7 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmovsxbq {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551600] ; SSE-NEXT: pmuludq %xmm4, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -161,7 +161,7 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) { ; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX-NEXT: vpmovsxbq {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] +; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] ; AVX-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vpsllq $32, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index 4a75eb182d79d..5ccb03b281f31 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -37,7 +37,7 @@ define <16 x i8> @combine_pavgb_zero(<16 x i8> %a0) { define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) { ; SSE-LABEL: combine_pavgw_knownbits: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pavgw %xmm1, %xmm0 @@ -61,7 +61,7 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16 ; ; AVX2-LABEL: combine_pavgw_knownbits: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll index d9283aa8591fc..985d93e7bb466 100644 --- a/llvm/test/CodeGen/X86/combine-pmadd.ll +++ b/llvm/test/CodeGen/X86/combine-pmadd.ll @@ -64,7 +64,7 @@ define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: combine_pmaddwd_concat_freeze: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] ; SSE-NEXT: pmaddwd %xmm2, %xmm0 ; SSE-NEXT: pmaddwd %xmm2, %xmm1 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 70335f834291d..b4cb07135e390 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -104,7 +104,7 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) { ; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pmovsxdq {{.*#+}} xmm4 = [715827883,715827883] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [715827883,715827883] ; SSE-NEXT: pmuludq %xmm4, %xmm0 ; SSE-NEXT: pmuludq %xmm4, %xmm1 ; SSE-NEXT: pmuludq %xmm4, %xmm2 @@ -329,7 +329,7 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; SSE-NEXT: movd %esi, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB7_1: # %loop @@ -346,7 +346,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; SSE-NEXT: pmuludq %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] ; SSE-NEXT: paddd %xmm4, %xmm1 -; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: addq $32, %rax +; SSE-NEXT: cmpq $524288, %rax # imm = 0x80000 ; SSE-NEXT: jne .LBB7_1 ; SSE-NEXT: # %bb.2: # %end ; SSE-NEXT: retq @@ -358,7 +359,7 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB7_1: # %loop @@ -377,7 +378,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: subq $-128, %rax +; AVX1-NEXT: addq $32, %rax +; AVX1-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX1-NEXT: jne .LBB7_1 ; AVX1-NEXT: # %bb.2: # %end ; AVX1-NEXT: retq @@ -388,7 +390,7 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB7_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -399,7 +401,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: addq $32, %rax +; AVX2-NEXT: cmpq $524288, %rax # imm = 0x80000 ; 
AVX2-NEXT: jne .LBB7_1 ; AVX2-NEXT: # %bb.2: # %end ; AVX2-NEXT: retq @@ -409,7 +412,7 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: movl %esi, %eax ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB7_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 @@ -418,7 +421,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: subq $-128, %rax +; AVX512VL-NEXT: addq $32, %rax +; AVX512VL-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX512VL-NEXT: jne .LBB7_1 ; AVX512VL-NEXT: # %bb.2: # %end ; AVX512VL-NEXT: retq @@ -428,7 +432,7 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: movl %esi, %eax ; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: xorl %eax, %eax ; AVX512DQVL-NEXT: .p2align 4 ; AVX512DQVL-NEXT: .LBB7_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 @@ -437,7 +441,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512DQVL-NEXT: subq $-128, %rax +; AVX512DQVL-NEXT: addq $32, %rax +; AVX512DQVL-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX512DQVL-NEXT: jne .LBB7_1 ; AVX512DQVL-NEXT: # %bb.2: # %end ; AVX512DQVL-NEXT: retq @@ -470,17 +475,17 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB8_1: # %loop ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-NEXT: pmovsxdq 2097176(%rdi,%rax), %xmm5 -; SSE-NEXT: pmovsxdq 2097168(%rdi,%rax), %xmm4 -; SSE-NEXT: pmovsxdq 2097152(%rdi,%rax), %xmm6 -; SSE-NEXT: pmovsxdq 2097160(%rdi,%rax), %xmm7 +; SSE-NEXT: pmovsxdq 24(%rdi,%rax,4), %xmm5 +; SSE-NEXT: pmovsxdq 16(%rdi,%rax,4), %xmm4 +; SSE-NEXT: pmovsxdq (%rdi,%rax,4), %xmm6 +; SSE-NEXT: pmovsxdq 8(%rdi,%rax,4), %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pmuludq %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm2, %xmm9 @@ -521,7 +526,8 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; SSE-NEXT: paddq %xmm7, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3] ; SSE-NEXT: paddd %xmm4, %xmm1 -; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: addq $32, %rax +; SSE-NEXT: cmpq $524288, %rax # imm = 0x80000 ; SSE-NEXT: jne .LBB8_1 ; SSE-NEXT: # %bb.2: # %end ; SSE-NEXT: retq @@ -533,15 +539,15 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB8_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxdq 2097152(%rdi,%rax), %xmm3 -; AVX1-NEXT: vpmovsxdq 2097160(%rdi,%rax), %xmm4 -; AVX1-NEXT: 
vpmovsxdq 2097168(%rdi,%rax), %xmm5 -; AVX1-NEXT: vpmovsxdq 2097176(%rdi,%rax), %xmm6 +; AVX1-NEXT: vpmovsxdq (%rdi,%rax,4), %xmm3 +; AVX1-NEXT: vpmovsxdq 8(%rdi,%rax,4), %xmm4 +; AVX1-NEXT: vpmovsxdq 16(%rdi,%rax,4), %xmm5 +; AVX1-NEXT: vpmovsxdq 24(%rdi,%rax,4), %xmm6 ; AVX1-NEXT: vpmuldq %xmm6, %xmm2, %xmm6 ; AVX1-NEXT: vpmuldq %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] @@ -552,7 +558,8 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: subq $-128, %rax +; AVX1-NEXT: addq $32, %rax +; AVX1-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %end ; AVX1-NEXT: retq @@ -563,18 +570,19 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX2-NEXT: vmovq %rax, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB8_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm2 -; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxdq (%rdi,%rax,4), %ymm2 +; AVX2-NEXT: vpmovsxdq 16(%rdi,%rax,4), %ymm3 ; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: addq $32, %rax +; AVX2-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX2-NEXT: jne .LBB8_1 ; AVX2-NEXT: # %bb.2: # %end ; AVX2-NEXT: retq @@ -584,7 +592,7 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: movslq %esi, %rax ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 @@ -593,7 +601,8 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: subq $-128, %rax +; AVX512VL-NEXT: addq $32, %rax +; AVX512VL-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %end ; AVX512VL-NEXT: retq @@ -603,7 +612,7 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: movslq %esi, %rax ; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: xorl %eax, %eax ; AVX512DQVL-NEXT: .p2align 4 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 @@ -612,7 +621,8 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512DQVL-NEXT: subq $-128, %rax +; AVX512DQVL-NEXT: addq $32, %rax +; AVX512DQVL-NEXT: cmpq $524288, %rax # imm = 0x80000 ; AVX512DQVL-NEXT: jne .LBB8_1 ; AVX512DQVL-NEXT: # %bb.2: # %end ; AVX512DQVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-ptest-256.ll b/llvm/test/CodeGen/X86/combine-ptest-256.ll index 
2612fad16db63..89d4944775495 100644 --- a/llvm/test/CodeGen/X86/combine-ptest-256.ll +++ b/llvm/test/CodeGen/X86/combine-ptest-256.ll @@ -9,8 +9,8 @@ define i32 @ptestz_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_invert0: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -28,8 +28,8 @@ define i32 @ptestz_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_256_invert1(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_invert1: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm0, %ymm1 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovael %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -47,8 +47,8 @@ define i32 @ptestz_256_invert1(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { define i32 @ptestc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestc_256_invert0: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -100,19 +100,19 @@ define i32 @ptestnzc_256_invert0_commute(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 define i32 @ptestc_256_not(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; AVX1-LABEL: ptestc_256_not: ; AVX1: # %bb.0: -; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vptest %ymm1, %ymm0 +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: cmovael %esi, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: ptestc_256_not: ; AVX2: # %bb.0: -; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: cmovael %esi, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -130,8 +130,8 @@ define i32 @ptestc_256_not(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_256_and(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_and: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -149,8 +149,8 @@ define i32 @ptestz_256_and(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_256_andc(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_andc: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm1, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovbl %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -169,8 +169,8 @@ define i32 @ptestz_256_andc(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_256_allones0(<4 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_allones0: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm0, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -187,8 +187,8 @@ define i32 @ptestz_256_allones0(<4 x i64> %c, i32 %a, i32 %b) { define i32 @ptestz_256_allones1(<4 x i64> %c, i32 %a, i32 %b) { ; CHECK-LABEL: ptestz_256_allones1: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: vptest %ymm0, %ymm0 +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cmovnel %esi, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -205,12 +205,12 @@ define i32 
@ptestz_256_allones1(<4 x i64> %c, i32 %a, i32 %b) { define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) { ; AVX1-LABEL: ptestz_v8i32_signbits: ; AVX1: # %bb.0: -; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: cmovnel %esi, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -233,13 +233,13 @@ define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) { define i32 @ptestz_v32i8_signbits(<32 x i8> %c, i32 %a, i32 %b) { ; AVX1-LABEL: ptestz_v32i8_signbits: ; AVX1: # %bb.0: -; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: cmovnel %esi, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll index fda14027e994e..283458548a918 100644 --- a/llvm/test/CodeGen/X86/combine-ptest.ll +++ b/llvm/test/CodeGen/X86/combine-ptest.ll @@ -11,15 +11,15 @@ define i32 @ptestz_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_invert0: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovael %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_invert0: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovael %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %c, @@ -36,15 +36,15 @@ define i32 @ptestz_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_128_invert1(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_invert1: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm0, %xmm1 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovael %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_invert1: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm0, %xmm1 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovael %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %d, @@ -61,15 +61,15 @@ define i32 @ptestz_128_invert1(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestc_128_invert0: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestc_128_invert0: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %c, @@ -86,15 +86,15 @@ define i32 @ptestc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestnzc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestnzc_128_invert0: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestnzc_128_invert0: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %c, @@ -111,17 
+111,17 @@ define i32 @ptestnzc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestc_128_not(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestc_128_not: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovael %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestc_128_not: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovael %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %c, @@ -138,15 +138,15 @@ define i32 @ptestc_128_not(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_128_and(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_and: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_and: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: retq %t1 = and <2 x i64> %c, %d @@ -163,15 +163,15 @@ define i32 @ptestz_128_and(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_128_andc(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_andc: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovael %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_andc: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovael %esi, %eax ; AVX-NEXT: retq %t1 = xor <2 x i64> %c, @@ -189,15 +189,15 @@ define i32 @ptestz_128_andc(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) { define i32 @ptestz_128_allones0(<2 x i64> %c, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_allones0: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_allones0: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> , <2 x i64> %c) @@ -213,15 +213,15 @@ define i32 @ptestz_128_allones0(<2 x i64> %c, i32 %a, i32 %b) { define i32 @ptestz_128_allones1(<2 x i64> %c, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_128_allones1: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_128_allones1: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> ) @@ -346,17 +346,17 @@ define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) { define i32 @ptestz_v2i64_concat(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) { ; SSE-LABEL: ptestz_v2i64_concat: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: ptest %xmm2, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovnel %esi, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: ptestz_v2i64_concat: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %ymm1, %ymm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovnel %esi, %eax ; AVX-NEXT: vzeroupper ; 
AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll index 65d74c8f262a3..4c52e57210dda 100644 --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -11,8 +11,8 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -118,11 +118,12 @@ define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: combine_vec_rot_select_zero: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] @@ -150,16 +151,16 @@ define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { ; ; AVX2-LABEL: combine_vec_rot_select_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm3 -; AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32] -; AVX2-NEXT: vpsubd %xmm3, %xmm5, %xmm3 -; AVX2-NEXT: vpsrlvd %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm5 +; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm2 +; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_select_zero: diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll index 89aee965a2c1f..8c66374f07c5e 100644 --- a/llvm/test/CodeGen/X86/combine-sbb.ll +++ b/llvm/test/CodeGen/X86/combine-sbb.ll @@ -7,26 +7,24 @@ define void @PR25858_i32(ptr sret(%WideUInt32), ptr, ptr) nounwind { ; X86-LABEL: PR25858_i32: ; X86: # %bb.0: # %top -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl 4(%edx), %edx -; X86-NEXT: subl (%ecx), %esi -; X86-NEXT: sbbl 4(%ecx), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: popl %esi +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: subl (%eax), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: sbbl 4(%eax), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 4(%eax) +; 
X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: PR25858_i32: ; X64: # %bb.0: # %top -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movl (%rsi), %ecx -; X64-NEXT: movl 4(%rsi), %esi ; X64-NEXT: subl (%rdx), %ecx +; X64-NEXT: movl 4(%rsi), %esi ; X64-NEXT: sbbl 4(%rdx), %esi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movl %esi, 4(%rdi) ; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: retq @@ -56,38 +54,36 @@ declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) define void @PR25858_i64(ptr sret(%WideUInt64), ptr, ptr) nounwind { ; X86-LABEL: PR25858_i64: ; X86: # %bb.0: # %top -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %edx -; X86-NEXT: movl 4(%edi), %esi -; X86-NEXT: movl 12(%edi), %ecx -; X86-NEXT: movl 8(%edi), %edi -; X86-NEXT: subl 8(%ebx), %edi -; X86-NEXT: sbbl 12(%ebx), %ecx -; X86-NEXT: subl (%ebx), %edx -; X86-NEXT: sbbl 4(%ebx), %esi -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: subl 8(%eax), %ecx +; X86-NEXT: movl 12(%esi), %edx +; X86-NEXT: sbbl 12(%eax), %edx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: subl (%eax), %edi +; X86-NEXT: movl 4(%esi), %esi +; X86-NEXT: sbbl 4(%eax), %esi ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; X64-LABEL: PR25858_i64: ; X64: # %bb.0: # %top -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq (%rsi), %rcx -; X64-NEXT: movq 8(%rsi), %rsi ; X64-NEXT: subq (%rdx), %rcx +; X64-NEXT: movq 8(%rsi), %rsi ; X64-NEXT: sbbq 8(%rdx), %rsi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rsi, 8(%rdi) ; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq @@ -118,9 +114,9 @@ define i8 @PR24545(i32, i32, ptr nocapture readonly) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: sbbl 4(%ecx), %eax +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl 4(%eax), %ecx ; X86-NEXT: setb %al ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 1ae1d61091362..4a7240385946a 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -402,7 +402,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -421,7 +422,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -437,7 +438,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -467,7 +468,8 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; XOP-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; XOP-NEXT: # xmm2 = mem[0,0] ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq %1 = sdiv <16 x i8> %x, @@ -513,8 +515,8 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,4,2,16,8,32,64,2] ; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [u,16384,u,4096,8192,2048,1024,u] +; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq @@ -524,9 +526,9 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,16384,u,4096,8192,2048,1024,u] +; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX1-NEXT: retq ; @@ -535,9 +537,9 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) { ; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,2,16,8,32,64,2] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,16384,u,4096,8192,2048,1024,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,16384,u,4096,8192,2048,1024,u] +; AVX2-NEXT: vpsraw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq ; @@ -637,7 +639,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psraw $15, %xmm2 -; SSE41-NEXT: 
pmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2] ; SSE41-NEXT: pmulhuw %xmm3, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -660,7 +662,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2 -; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,4,2,16,8,32,64,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2] ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -686,9 +688,9 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,4,2,16,8,32,64,2,u,4,2,16,8,32,64,2] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [u,16384,u,4096,8192,2048,1024,u,u,16384,u,4096,8192,2048,1024,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15] +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [u,16384,u,4096,8192,2048,1024,u,u,16384,u,4096,8192,2048,1024,u] +; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11,12,13,14],ymm1[15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-NEXT: retq ; @@ -716,10 +718,10 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2 -; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65522,65521,65524,65523,65525,65526,65521] +; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65522,65521,65524,65523,65525,65526,65521] ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,65534,65535,65532,65533,65531,65530,65535] +; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,65534,65535,65532,65533,65531,65530,65535] ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3 @@ -851,7 +853,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: psraw $15, %xmm6 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [0,4,2,16,8,32,64,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [u,4,2,16,8,32,64,2] ; SSE41-NEXT: pmulhuw %xmm5, %xmm6 ; SSE41-NEXT: paddw %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -892,7 +894,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3 -; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,4,2,16,8,32,64,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [u,4,2,16,8,32,64,2] ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,16384,32768,4096,8192,2048,1024,32768] @@ -954,24 +956,24 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 -; 
AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 +; AVX512F-NEXT: vpmulhuw %ymm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5 +; AVX512F-NEXT: vpmulhuw %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpaddw %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) @@ -993,10 +995,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 -; XOP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [0,65522,65521,65524,65523,65525,65526,65521] +; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,65522,65521,65524,65523,65525,65526,65521] ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,65534,65535,65532,65533,65531,65530,65535] +; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65534,65535,65532,65533,65531,65530,65535] ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 @@ -1232,10 +1234,10 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967266,4294967267,4294967268] +; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967266,4294967267,4294967268] ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] +; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,4294967294,4294967293,4294967292] ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 @@ -1508,10 +1510,10 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,4294967266,4294967267,4294967268] +; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = 
[u,4294967266,4294967267,4294967268] ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,4294967294,4294967293,4294967292] +; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967294,4294967293,4294967292] ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 @@ -1716,7 +1718,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,3,4] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [u,2,3,4] ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 @@ -1735,7 +1737,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: ; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm1 = mem[0,0] ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 @@ -1901,10 +1904,10 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,62,61,60] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,62,61,60] ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,2,3,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,2,3,4] ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,2305843009213693952,1152921504606846976,576460752303423488] ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 @@ -1946,17 +1949,19 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm3 = mem[0,0] ; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] ; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 ; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] +; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] ; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 ; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 ; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] +; XOP-NEXT: vmovddup {{.*#+}} xmm7 = [18446744073709551614,18446744073709551614] +; XOP-NEXT: # xmm7 = mem[0,0] ; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] @@ -2690,7 +2695,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform5: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 
= [65535,0,65535,0,0,0,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] ; SSE41-NEXT: paddw %xmm0, %xmm1 @@ -2734,11 +2739,11 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 -; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm1 +; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2802,15 +2807,15 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform6: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385] ; SSE41-NEXT: paddw %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlw $15, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,u,u,512,256,8] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq @@ -2822,8 +2827,8 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -2835,8 +2840,8 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2848,9 +2853,9 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index 1ce10c3708d58..b91a921125bac 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -86,12 +86,11 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) { ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_known_zero1: @@ -149,7 +148,7 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 @@ -199,8 +198,8 @@ define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -308,11 +307,11 @@ define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] @@ -788,8 +787,8 @@ define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -952,7 +951,7 @@ define <4 x i32> 
@combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) { ; ; SSE41-LABEL: combine_vec_shl_clamped1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pslld $23, %xmm1 @@ -993,12 +992,12 @@ define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) { ; ; SSE41-LABEL: combine_vec_shl_clamped2: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: pslld $23, %xmm1 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1034,12 +1033,12 @@ define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt ; ; SSE41-LABEL: combine_vec_shl_commuted_clamped: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: pslld $23, %xmm1 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1076,7 +1075,7 @@ define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %am ; ; SSE41-LABEL: combine_vec_shl_commuted_clamped1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pslld $23, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll index 0133827b85cae..f3b9cc100f485 100644 --- a/llvm/test/CodeGen/X86/combine-smax.ll +++ b/llvm/test/CodeGen/X86/combine-smax.ll @@ -54,7 +54,7 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; ; AVX2-LABEL: test_v16i8_nosignbit: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll index b58934256a209..6a077524f5f52 100644 --- a/llvm/test/CodeGen/X86/combine-smin.ll +++ b/llvm/test/CodeGen/X86/combine-smin.ll @@ -54,7 +54,7 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; ; AVX2-LABEL: test_v16i8_nosignbit: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index c982884314f62..c1da16c693206 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -186,14 +186,14 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -209,13 +209,13 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -230,7 +230,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 @@ -298,7 +298,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -399,7 +399,7 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -465,14 +465,14 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -487,13 +487,13 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -570,10 +570,10 @@ define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -627,14 +627,14 @@ define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad %xmm4, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm2, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -649,13 +649,13 @@ define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; 
SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -706,17 +706,17 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE2-NEXT: psrlq %xmm3, %xmm5 ; SSE2-NEXT: psrlq %xmm6, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE2-NEXT: xorpd %xmm7, %xmm0 -; SSE2-NEXT: psubq %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrlq %xmm4, %xmm3 +; SSE2-NEXT: xorpd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm5, %xmm2 +; SSE2-NEXT: psubq %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrlq %xmm4, %xmm6 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm4, %xmm3 ; SSE2-NEXT: psrlq %xmm5, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] ; SSE2-NEXT: xorpd %xmm2, %xmm1 ; SSE2-NEXT: psubq %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -755,17 +755,17 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; SSE41-NEXT: psrlq %xmm9, %xmm2 ; SSE41-NEXT: psrlq %xmm3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: psubq %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] ; SSE41-NEXT: psrlq %xmm3, %xmm0 +; SSE41-NEXT: psubq %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrlq %xmm6, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlq %xmm6, %xmm2 ; SSE41-NEXT: psrlq %xmm3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: psubq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm4, %xmm0 @@ -776,9 +776,9 @@ define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [63,63,63,63] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63] -; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm3, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll index 4b01c16a6324e..9d41905074533 100644 --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -90,8 +90,8 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -209,8 +209,8 @@ define <4 x i32> @combine_vec_srem_by_pow2a(<4 x i32> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: 
vpsrld $30, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967292,4294967292,4294967292,4294967292] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -242,8 +242,8 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpsrld $30, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967292,4294967292,4294967292,4294967292] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -300,7 +300,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b(<4 x i32> %x) { ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3] ; AVX2-NEXT: vpsravd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpsllvd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 7bc90534dcc6e..75210f1535740 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -198,7 +198,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsrlq $48, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -282,7 +282,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) { ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -479,7 +479,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; SSE41-NEXT: pcmpeqb %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 @@ -534,14 +534,14 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; 
SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -557,13 +557,13 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -578,7 +578,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 @@ -611,22 +611,22 @@ define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) { ; SSE2-LABEL: combine_vec_lshr_clamped1: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrld %xmm3, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psrld %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3] ; SSE2-NEXT: pandn %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 @@ -634,7 +634,7 @@ define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) { ; ; SSE41-LABEL: combine_vec_lshr_clamped1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] @@ -644,13 +644,13 @@ define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] ; 
SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: psrld %xmm5, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psrld %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrld %xmm1, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -679,20 +679,20 @@ define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrld %xmm0, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrld %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrld %xmm0, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm0, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_lshr_clamped2: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -703,12 +703,12 @@ define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq @@ -737,20 +737,20 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %am ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrld %xmm0, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrld %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrld %xmm0, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm0, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_lshr_commuted_clamped: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd 
{{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -761,12 +761,12 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %am ; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld %xmm2, %xmm3 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq @@ -790,14 +790,14 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %a ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrld %xmm3, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm2, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -814,15 +814,15 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %a ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psrld %xmm3, %xmm4 +; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: psrld %xmm4, %xmm6 ; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll index 6034a24099e4d..144227f3b34d3 100644 --- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll @@ -189,15 +189,16 @@ define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> 
%a2) { ; SSE-LABEL: demandedbits_uitofp_blendvps: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200] -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1258291200,1258291200,1258291200,1258291200] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7] +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE-NEXT: addps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: demandedbits_uitofp_blendvps: @@ -214,8 +215,8 @@ define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1258291200,1258291200,1258291200,1258291200] ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm4 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] ; AVX2-NEXT: vsubps %xmm4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll index b70e3fcd779c5..8be82efbacd6f 100644 --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -52,20 +52,10 @@ define <8 x i16> @combine_constfold_v8i16() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_constfold_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_constfold_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] -; AVX2-NEXT: retq -; -; AVX512-LABEL: combine_constfold_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = [0,254,65534,0] -; AVX512-NEXT: retq +; AVX-LABEL: combine_constfold_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] +; AVX-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -76,20 +66,10 @@ define <8 x i16> @combine_constfold_undef_v8i16() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_constfold_undef_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_constfold_undef_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] -; AVX2-NEXT: retq -; -; AVX512-LABEL: combine_constfold_undef_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = [0,65534] -; AVX512-NEXT: retq +; AVX-LABEL: combine_constfold_undef_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] +; AVX-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -251,7 +231,7 @@ define <8 x i16> 
@combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE41-LABEL: combine_trunc_v8i32_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -260,7 +240,7 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE42-LABEL: combine_trunc_v8i32_v8i16: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE42-NEXT: pminud %xmm3, %xmm2 ; SSE42-NEXT: pminud %xmm3, %xmm1 ; SSE42-NEXT: packusdw %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll index f16b85eaa195d..d34f1ec4093cc 100644 --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -104,14 +104,14 @@ define <4 x i32> @combine_vec_sub_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_vec_sub_constant_add(<4 x i32> %a) { ; SSE-LABEL: combine_vec_sub_constant_add: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293] ; SSE-NEXT: psubd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sub_constant_add: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %a, diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 55715197830b1..642b290079ec5 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -227,14 +227,14 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -248,13 +248,13 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = 
xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -264,14 +264,14 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_by_pow2c: @@ -301,14 +301,14 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -323,13 +323,13 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -340,14 +340,14 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; 
AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: @@ -378,14 +378,14 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -400,13 +400,13 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -415,16 +415,16 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 -; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} 
xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b: @@ -435,7 +435,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; XOP-LABEL: combine_vec_udiv_by_shl_pow2b: ; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] +; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292] ; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -659,7 +659,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -673,7 +673,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; XOP-NEXT: movl $249, %eax ; XOP-NEXT: vmovd %eax, %xmm2 ; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq %div = udiv <16 x i8> %x, diff --git a/llvm/test/CodeGen/X86/combine-undef-index-mscatter.ll b/llvm/test/CodeGen/X86/combine-undef-index-mscatter.ll index 6f748d405458c..137c361305af9 100644 --- a/llvm/test/CodeGen/X86/combine-undef-index-mscatter.ll +++ b/llvm/test/CodeGen/X86/combine-undef-index-mscatter.ll @@ -10,12 +10,12 @@ define void @main(<24 x ptr> %x) ; CHECK-NEXT: vmovq %rsi, %xmm1 ; CHECK-NEXT: vmovq %rdi, %xmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; CHECK-NEXT: vmovq %r9, %xmm2 +; CHECK-NEXT: vmovq %r8, %xmm3 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovq %r9, %xmm1 -; CHECK-NEXT: vmovq %r8, %xmm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; CHECK-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm1 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm2 ; CHECK-NEXT: kxnorw %k0, %k0, %k1 diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll index 715d5c7b28f11..d17ea107ba096 100644 --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -346,7 +346,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2b: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -362,7 +362,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_lshr_pow2a: ; SSE: # %bb.0: ; SSE-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: pmovsxbd {{.*#+}} xmm3 = [4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -419,7 +419,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_lshr_pow2b: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,4,8,16] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,8,16] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] @@ -442,7 +442,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,8,16] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16] ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 @@ -461,7 +461,7 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,8,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/comi-flags.ll b/llvm/test/CodeGen/X86/comi-flags.ll index 6f520aa57dcd0..756e101ee4d9f 100644 --- a/llvm/test/CodeGen/X86/comi-flags.ll +++ b/llvm/test/CodeGen/X86/comi-flags.ll @@ -44,17 +44,24 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse_comige_ss: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: comiss %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_comige_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomiss %xmm1, %xmm0 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_comige_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_comige_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomiss %xmm1, %xmm0 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -86,17 +93,24 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse_comile_ss: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: comiss %xmm0, %xmm1 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_comile_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_comile_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vcomiss %xmm0, %xmm1 +; NO-AVX10_2-NEXT: movl 
%edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_comile_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomiss %xmm0, %xmm1 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -192,17 +206,24 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse_ucomige_ss: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_ucomige_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_ucomige_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vucomiss %xmm1, %xmm0 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_ucomige_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomiss %xmm1, %xmm0 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -234,17 +255,24 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse_ucomile_ss: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ucomiss %xmm0, %xmm1 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse_ucomile_ss: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse_ucomile_ss: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vucomiss %xmm0, %xmm1 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse_ucomile_ss: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomiss %xmm0, %xmm1 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -344,17 +372,24 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse2_comige_sd: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: comisd %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_comige_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomisd %xmm1, %xmm0 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_comige_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vcomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_comige_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomisd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovbl 
%esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -386,17 +421,24 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse2_comile_sd: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: comisd %xmm0, %xmm1 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_comile_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vcomisd %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_comile_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vcomisd %xmm0, %xmm1 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_comile_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vcomisd %xmm0, %xmm1 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -492,17 +534,24 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse2_ucomige_sd: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ucomisd %xmm1, %xmm0 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_ucomige_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_ucomige_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vucomisd %xmm1, %xmm0 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_ucomige_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomisd %xmm1, %xmm0 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 %a2, i32 %a3 @@ -534,17 +583,24 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1, i32 %a2, i32 %a3) { ; SSE-LABEL: test_x86_sse2_ucomile_sd: ; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: ucomisd %xmm0, %xmm1 +; SSE-NEXT: movl %edi, %eax ; SSE-NEXT: cmovbl %esi, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_x86_sse2_ucomile_sd: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vucomisd %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq +; NO-AVX10_2-LABEL: test_x86_sse2_ucomile_sd: +; NO-AVX10_2: # %bb.0: +; NO-AVX10_2-NEXT: vucomisd %xmm0, %xmm1 +; NO-AVX10_2-NEXT: movl %edi, %eax +; NO-AVX10_2-NEXT: cmovbl %esi, %eax +; NO-AVX10_2-NEXT: retq +; +; AVX10_2-LABEL: test_x86_sse2_ucomile_sd: +; AVX10_2: # %bb.0: +; AVX10_2-NEXT: movl %edi, %eax +; AVX10_2-NEXT: vucomisd %xmm0, %xmm1 +; AVX10_2-NEXT: cmovbl %esi, %eax +; AVX10_2-NEXT: retq %call = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; [#uses=1] %cmp = icmp ne i32 %call, 0 %res = select i1 %cmp, i32 
%a2, i32 %a3 diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll index ae61fa1eb2319..ab0c89c4eeee1 100644 --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -209,17 +209,17 @@ define <4 x double> @uitofp_v4i32_v4f64(<2 x i32> %x, <2 x i32> %y) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_v4i32_v4f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -266,22 +266,22 @@ define <4 x i32> @fptosi_v4f64_v4i32(<2 x double> %x, <2 x double> %y) { define <4 x i32> @fptoui_v4f64_v4i32(<2 x double> %x, <2 x double> %y) { ; SSE-LABEL: fptoui_v4f64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9] -; SSE-NEXT: cvttpd2dq %xmm0, %xmm3 -; SSE-NEXT: subpd %xmm2, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm0, %xmm2 +; SSE-NEXT: movapd {{.*#+}} xmm3 = [2.147483648E+9,2.147483648E+9] +; SSE-NEXT: subpd %xmm3, %xmm0 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm4 -; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: psrad $31, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: cvttpd2dq %xmm1, %xmm3 -; SSE-NEXT: subpd %xmm2, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: cvttpd2dq %xmm1, %xmm2 +; SSE-NEXT: subpd %xmm3, %xmm1 ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 -; SSE-NEXT: movapd %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_v4f64_v4i32: @@ -289,9 +289,9 @@ define <4 x i32> @fptoui_v4f64_v4i32(<2 x double> %x, <2 x double> %y) { ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper @@ -302,10 +302,10 @@ define <4 x i32> @fptoui_v4f64_v4i32(<2 x double> %x, <2 x double> %y) { ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX2-NEXT: vpsrad 
$31, %xmm1, %xmm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vsubpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX2-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper @@ -374,7 +374,7 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) { ; AVX2-LABEL: mismatch_tofp_v4i32_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll b/llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll index b3891a61f4574..1f80b40e05352 100644 --- a/llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll +++ b/llvm/test/CodeGen/X86/concat-fpext-v2bf16.ll @@ -10,9 +10,9 @@ define void @test(<2 x ptr> %ptr) { ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB0_1: # %ifmerge.89 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm2 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovlps %xmm0, (%rax) entry: diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll index 4c990d81810be..19f2cd14bc72b 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -263,80 +263,76 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; CHECK32-NEXT: .LBB3_1: # %for.cond ; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK32-NEXT: testl %edx, %edx # encoding: [0x85,0xd2] -; CHECK32-NEXT: je .LBB3_14 # encoding: [0x74,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_14-1, kind: FK_PCRel_1 +; CHECK32-NEXT: je .LBB3_13 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1 ; CHECK32-NEXT: # %bb.2: # %for.body ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: cmpl $2, %ebx # encoding: [0x83,0xfb,0x02] -; CHECK32-NEXT: je .LBB3_12 # encoding: [0x74,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1 +; CHECK32-NEXT: je .LBB3_11 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK32-NEXT: # %bb.3: # %for.body ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: cmpl $1, %ebx # encoding: [0x83,0xfb,0x01] -; CHECK32-NEXT: je .LBB3_10 # encoding: [0x74,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK32-NEXT: je .LBB3_9 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 ; CHECK32-NEXT: # %bb.4: # %for.body ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: testl %ebx, %ebx # encoding: [0x85,0xdb] -; 
CHECK32-NEXT: jne .LBB3_11 # encoding: [0x75,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 +; CHECK32-NEXT: jne .LBB3_10 # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; CHECK32-NEXT: # %bb.5: # %sw.bb ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: movzbl (%eax), %ebp # encoding: [0x0f,0xb6,0x28] -; CHECK32-NEXT: cmpl $43, %ebp # encoding: [0x83,0xfd,0x2b] ; CHECK32-NEXT: movl %edi, %ebx # encoding: [0x89,0xfb] -; CHECK32-NEXT: je .LBB3_11 # encoding: [0x74,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 +; CHECK32-NEXT: cmpl $43, %ebp # encoding: [0x83,0xfd,0x2b] +; CHECK32-NEXT: je .LBB3_10 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; CHECK32-NEXT: # %bb.6: # %sw.bb ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK32-NEXT: cmpl $45, %ebp # encoding: [0x83,0xfd,0x2d] ; CHECK32-NEXT: movl %edi, %ebx # encoding: [0x89,0xfb] -; CHECK32-NEXT: je .LBB3_11 # encoding: [0x74,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 -; CHECK32-NEXT: # %bb.7: # %if.else +; CHECK32-NEXT: cmpl $45, %ebp # encoding: [0x83,0xfd,0x2d] +; CHECK32-NEXT: jne .LBB3_7 # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1 +; CHECK32-NEXT: jmp .LBB3_10 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB3_9: # %sw.bb14 +; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 +; CHECK32-NEXT: movzbl (%eax), %ebp # encoding: [0x0f,0xb6,0x28] +; CHECK32-NEXT: .LBB3_7: # %if.else ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: addl $-48, %ebp # encoding: [0x83,0xc5,0xd0] +; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3] ; CHECK32-NEXT: cmpl $10, %ebp # encoding: [0x83,0xfd,0x0a] -; CHECK32-NEXT: jmp .LBB3_8 # encoding: [0xeb,A] +; CHECK32-NEXT: jae .LBB3_8 # encoding: [0x73,A] ; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB3_10: # %sw.bb14 -; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK32-NEXT: movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18] -; CHECK32-NEXT: addl $-48, %ebx # encoding: [0x83,0xc3,0xd0] -; CHECK32-NEXT: cmpl $10, %ebx # encoding: [0x83,0xfb,0x0a] -; CHECK32-NEXT: .LBB3_8: # %if.else -; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3] -; CHECK32-NEXT: jae .LBB3_9 # encoding: [0x73,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 -; CHECK32-NEXT: jmp .LBB3_11 # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB3_12: # %sw.bb22 +; CHECK32-NEXT: jmp .LBB3_10 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB3_11: # %sw.bb22 ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK32-NEXT: movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18] -; CHECK32-NEXT: addl $-48, %ebx # encoding: [0x83,0xc3,0xd0] -; CHECK32-NEXT: cmpl $10, %ebx # encoding: [0x83,0xfb,0x0a] +; CHECK32-NEXT: movzbl (%eax), %ebp # encoding: [0x0f,0xb6,0x28] +; CHECK32-NEXT: addl $-48, %ebp # encoding: [0x83,0xc5,0xd0] ; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3] -; CHECK32-NEXT: jae .LBB3_13 # encoding: [0x73,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB3_11: # %for.inc +; 
CHECK32-NEXT: cmpl $10, %ebp # encoding: [0x83,0xfd,0x0a] +; CHECK32-NEXT: jae .LBB3_12 # encoding: [0x73,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB3_10: # %for.inc ; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK32-NEXT: incl %eax # encoding: [0x40] ; CHECK32-NEXT: decl %edx # encoding: [0x4a] ; CHECK32-NEXT: jmp .LBB3_1 # encoding: [0xeb,A] ; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB3_14: +; CHECK32-NEXT: .LBB3_13: ; CHECK32-NEXT: cmpl $2, %ebx # encoding: [0x83,0xfb,0x02] ; CHECK32-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK32-NEXT: jmp .LBB3_15 # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_15-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB3_9: +; CHECK32-NEXT: jmp .LBB3_14 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_14-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB3_8: ; CHECK32-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK32-NEXT: .LBB3_15: # %cleanup.thread +; CHECK32-NEXT: .LBB3_14: # %cleanup.thread ; CHECK32-NEXT: # kill: def $al killed $al killed $eax ; CHECK32-NEXT: addl $12, %esp # encoding: [0x83,0xc4,0x0c] ; CHECK32-NEXT: .cfi_def_cfa_offset 20 -; CHECK32-NEXT: .LBB3_16: # %cleanup.thread +; CHECK32-NEXT: .LBB3_15: # %cleanup.thread ; CHECK32-NEXT: popl %esi # encoding: [0x5e] ; CHECK32-NEXT: .cfi_def_cfa_offset 16 ; CHECK32-NEXT: popl %edi # encoding: [0x5f] @@ -346,7 +342,7 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; CHECK32-NEXT: popl %ebp # encoding: [0x5d] ; CHECK32-NEXT: .cfi_def_cfa_offset 4 ; CHECK32-NEXT: retl # encoding: [0xc3] -; CHECK32-NEXT: .LBB3_13: # %if.else28 +; CHECK32-NEXT: .LBB3_12: # %if.else28 ; CHECK32-NEXT: .cfi_def_cfa_offset 32 ; CHECK32-NEXT: subl $8, %esp # encoding: [0x83,0xec,0x08] ; CHECK32-NEXT: .cfi_adjust_cfa_offset 8 @@ -358,8 +354,8 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; CHECK32-NEXT: # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-4, kind: FK_PCRel_4 ; CHECK32-NEXT: addl $28, %esp # encoding: [0x83,0xc4,0x1c] ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: jmp .LBB3_16 # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_16-1, kind: FK_PCRel_1 +; CHECK32-NEXT: jmp .LBB3_15 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_15-1, kind: FK_PCRel_1 ; ; CHECK64-LABEL: pr31257: ; CHECK64: # %bb.0: # %entry @@ -378,75 +374,71 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; CHECK64-NEXT: .LBB3_1: # %for.cond ; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK64-NEXT: testq %rax, %rax # encoding: [0x48,0x85,0xc0] -; CHECK64-NEXT: je .LBB3_12 # encoding: [0x74,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1 +; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.2: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] -; CHECK64-NEXT: je .LBB3_10 # encoding: [0x74,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK64-NEXT: je .LBB3_9 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.3: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 
Depth=1 ; CHECK64-NEXT: cmpl $1, %r8d # encoding: [0x41,0x83,0xf8,0x01] -; CHECK64-NEXT: je .LBB3_8 # encoding: [0x74,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1 +; CHECK64-NEXT: je .LBB3_7 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.4: # %for.body ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK64-NEXT: testl %r8d, %r8d # encoding: [0x45,0x85,0xc0] -; CHECK64-NEXT: jne .LBB3_11 # encoding: [0x75,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 +; CHECK64-NEXT: jne .LBB3_10 # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.5: # %sw.bb ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK64-NEXT: movzbl (%rdi), %r9d # encoding: [0x44,0x0f,0xb6,0x0f] -; CHECK64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0] -; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 +; CHECK64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] +; CHECK64-NEXT: je .LBB3_10 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; CHECK64-NEXT: # %bb.6: # %sw.bb ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d] ; CHECK64-NEXT: movl %edx, %r8d # encoding: [0x41,0x89,0xd0] -; CHECK64-NEXT: je .LBB3_11 # encoding: [0x74,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 -; CHECK64-NEXT: # %bb.7: # %if.else -; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] -; CHECK64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] -; CHECK64-NEXT: jmp .LBB3_9 # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB3_8: # %sw.bb14 +; CHECK64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d] +; CHECK64-NEXT: jne .LBB3_8 # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1 +; CHECK64-NEXT: jmp .LBB3_10 # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB3_7: # %sw.bb14 ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07] -; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0] -; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a] -; CHECK64-NEXT: .LBB3_9: # %if.else +; CHECK64-NEXT: movzbl (%rdi), %r9d # encoding: [0x44,0x0f,0xb6,0x0f] +; CHECK64-NEXT: .LBB3_8: # %if.else ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 +; CHECK64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] ; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8] -; CHECK64-NEXT: jb .LBB3_11 # encoding: [0x72,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1 -; CHECK64-NEXT: jmp .LBB3_13 # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB3_10: # %sw.bb22 +; CHECK64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] +; CHECK64-NEXT: jb .LBB3_10 # encoding: [0x72,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 +; CHECK64-NEXT: jmp .LBB3_12 # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_12-1, 
kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB3_9: # %sw.bb22 ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK64-NEXT: movzbl (%rdi), %r8d # encoding: [0x44,0x0f,0xb6,0x07] -; CHECK64-NEXT: addl $-48, %r8d # encoding: [0x41,0x83,0xc0,0xd0] -; CHECK64-NEXT: cmpl $10, %r8d # encoding: [0x41,0x83,0xf8,0x0a] +; CHECK64-NEXT: movzbl (%rdi), %r9d # encoding: [0x44,0x0f,0xb6,0x0f] +; CHECK64-NEXT: addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0] ; CHECK64-NEXT: movl %ecx, %r8d # encoding: [0x41,0x89,0xc8] +; CHECK64-NEXT: cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a] ; CHECK64-NEXT: jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL ; CHECK64-NEXT: # encoding: [0x73,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB3_11: # %for.inc +; CHECK64-NEXT: .LBB3_10: # %for.inc ; CHECK64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; CHECK64-NEXT: incq %rdi # encoding: [0x48,0xff,0xc7] ; CHECK64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8] ; CHECK64-NEXT: jmp .LBB3_1 # encoding: [0xeb,A] ; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB3_12: +; CHECK64-NEXT: .LBB3_11: ; CHECK64-NEXT: cmpl $2, %r8d # encoding: [0x41,0x83,0xf8,0x02] ; CHECK64-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK64-NEXT: # kill: def $al killed $al killed $eax ; CHECK64-NEXT: retq # encoding: [0xc3] -; CHECK64-NEXT: .LBB3_13: +; CHECK64-NEXT: .LBB3_12: ; CHECK64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] ; CHECK64-NEXT: # kill: def $al killed $al killed $eax ; CHECK64-NEXT: retq # encoding: [0xc3] @@ -480,8 +472,8 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize ; WIN64-NEXT: # %bb.5: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] -; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00] +; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.6: # %sw.bb diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll index e1711ccdbe13f..f5d2aff074647 100644 --- a/llvm/test/CodeGen/X86/copy-eflags.ll +++ b/llvm/test/CodeGen/X86/copy-eflags.ll @@ -49,10 +49,10 @@ define dso_local i32 @test1() nounwind { ; X64-NEXT: incl c(%rip) ; X64-NEXT: sete %dl ; X64-NEXT: movzbl a(%rip), %esi -; X64-NEXT: leal 1(%rsi), %edi ; X64-NEXT: cmpb %cl, %sil ; X64-NEXT: sete d(%rip) -; X64-NEXT: movb %dil, a(%rip) +; X64-NEXT: leal 1(%rsi), %ecx +; X64-NEXT: movb %cl, a(%rip) ; X64-NEXT: testb %dl, %dl ; X64-NEXT: jne .LBB0_2 ; X64-NEXT: # %bb.1: # %if.then diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index d9393ba9febb2..27dcd3f407cba 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -788,8 +788,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_unary(<32 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> @@ -800,8 +799,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary(<3 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary: ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> @@ -858,8 +856,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_unary(<32 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> @@ -871,8 +868,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary(<3 ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 ; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 -; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; CHECK-NEXT: # ymm2 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> diff --git a/llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll b/llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll index 653ce171b1d8f..3bbe4e8efa5f9 100644 --- a/llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll +++ b/llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll @@ -24,9 +24,9 @@ define i32 @Part_Create(ptr %Anchor, i32 %TypeNum, i32 %F, i32 %Z, ptr %Status, ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq NullToken@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq PartClass@GOTPCREL(%rip), %r10 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: xorl 
%edx, %edx ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: movl (%r10), %ebp ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll index 1267fe9033454..a4c6e2be32920 100644 --- a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -158,8 +158,8 @@ define i64 @ctlz_i64(i64 %x) { ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: bsrl %ecx, %edx -; X86-CMOV-NEXT: xorl $31, %edx ; X86-CMOV-NEXT: bsrl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: xorl $31, %edx ; X86-CMOV-NEXT: xorl $31, %eax ; X86-CMOV-NEXT: orl $32, %eax ; X86-CMOV-NEXT: testl %ecx, %ecx @@ -890,10 +890,10 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) { ; ; X86-FASTLZCNT-LABEL: PR47603_zext: ; X86-FASTLZCNT: # %bb.0: -; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FASTLZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx -; X86-FASTLZCNT-NEXT: xorl $31, %ecx -; X86-FASTLZCNT-NEXT: movsbl (%eax,%ecx), %eax +; X86-FASTLZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FASTLZCNT-NEXT: xorl $31, %eax +; X86-FASTLZCNT-NEXT: movsbl (%ecx,%eax), %eax ; X86-FASTLZCNT-NEXT: retl %ctlz = tail call i32 @llvm.ctlz.i32(i32 %a0, i1 true) %xor = xor i32 %ctlz, 31 diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll index 2774a93993153..205651ce87679 100644 --- a/llvm/test/CodeGen/X86/dag-large-offset.ll +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -23,7 +23,8 @@ define i32 @foo(i1 %b) #0 { ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_1: ; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001 -; CHECK-NEXT: leal -5(%ebp,%eax), %eax +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: addl $-5, %eax ; CHECK-NEXT: .LBB0_3: # %entry ; CHECK-NEXT: movl __stack_chk_guard, %ecx ; CHECK-NEXT: cmpl -4(%ebp), %ecx diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b09674e187..0218c99f41e37 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -28,26 +28,26 @@ define void @_Z1nv() local_unnamed_addr { ; CHECK-NEXT: movq c@GOTPCREL(%rip), %rax ; CHECK-NEXT: movswl (%rax), %ecx ; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax -; CHECK-NEXT: movswl (%rax), %edi ; CHECK-NEXT: movq a@GOTPCREL(%rip), %rsi ; CHECK-NEXT: movl (%rsi), %esi -; CHECK-NEXT: movq l@GOTPCREL(%rip), %r8 -; CHECK-NEXT: movl (%r8), %r8d -; CHECK-NEXT: movl %r8d, %r9d -; CHECK-NEXT: shll $7, %r9d -; CHECK-NEXT: sarl $7, %r9d -; CHECK-NEXT: negl %r9d +; CHECK-NEXT: movq l@GOTPCREL(%rip), %rdi +; CHECK-NEXT: movl (%rdi), %edi +; CHECK-NEXT: movl %edi, %r8d +; CHECK-NEXT: shll $7, %r8d +; CHECK-NEXT: sarl $7, %r8d +; CHECK-NEXT: negl %r8d ; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: cmovel %esi, %r9d +; CHECK-NEXT: cmovel %esi, %r8d +; CHECK-NEXT: movswl (%rax), %r9d ; CHECK-NEXT: movzwl %dx, %r10d ; CHECK-NEXT: leal (%rcx,%r10,2), %ecx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: cmpl %r9d, %ecx -; CHECK-NEXT: sete %dil -; CHECK-NEXT: testl $33554431, %r8d # imm = 0x1FFFFFF +; CHECK-NEXT: addl %r9d, %ecx +; CHECK-NEXT: cmpl %r8d, %ecx ; CHECK-NEXT: sete %r8b -; CHECK-NEXT: orb %dil, %r8b -; CHECK-NEXT: movzbl %r8b, %edi +; CHECK-NEXT: testl $33554431, %edi # imm = 0x1FFFFFF +; CHECK-NEXT: sete %dil +; CHECK-NEXT: orb %r8b, %dil +; CHECK-NEXT: movzbl %dil, %edi ; CHECK-NEXT: movq e@GOTPCREL(%rip), %r8 ; 
CHECK-NEXT: movw %di, (%r8) ; CHECK-NEXT: notl %ecx @@ -96,6 +96,17 @@ entry: define void @_Z2x6v() local_unnamed_addr { ; CHECK-LABEL: _Z2x6v: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax +; CHECK-NEXT: movl (%rax), %edx +; CHECK-NEXT: andl $511, %edx # imm = 0x1FF +; CHECK-NEXT: leaq 1(%rdx), %rax +; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx +; CHECK-NEXT: movl %eax, (%rcx) +; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx +; CHECK-NEXT: movl (%rcx), %ecx +; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: je .LBB1_18 +; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r15 @@ -114,32 +125,21 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %ebx -; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF -; CHECK-NEXT: leaq 1(%rbx), %rax -; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl %eax, (%rcx) -; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx -; CHECK-NEXT: movl (%rcx), %ecx -; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: je .LBB1_18 -; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph -; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx -; CHECK-NEXT: movq (%rdx), %rsi -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: notl %edx -; CHECK-NEXT: leaq 8(,%rdx,8), %rdi +; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rsi +; CHECK-NEXT: movq (%rsi), %rsi +; CHECK-NEXT: movl %ecx, %edi +; CHECK-NEXT: notl %edi +; CHECK-NEXT: leaq 8(,%rdi,8), %rdi ; CHECK-NEXT: imulq %rax, %rdi ; CHECK-NEXT: addq %rsi, %rdi ; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8 -; CHECK-NEXT: movl (%r8), %edx -; CHECK-NEXT: leal 8(,%rbx,8), %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%rsi), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movl (%r8), %eax +; CHECK-NEXT: leal 8(,%rdx,8), %r9d +; CHECK-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq 8(%rsi), %r9 +; CHECK-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%rsi), %r11 -; CHECK-NEXT: leaq 8(,%rbx,8), %rbx +; CHECK-NEXT: leaq 8(,%rdx,8), %rbx ; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15 ; CHECK-NEXT: movq %rsi, %r12 @@ -147,7 +147,7 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movl %edx, (%r8) +; CHECK-NEXT: movl %eax, (%r8) ; CHECK-NEXT: .LBB1_16: # %for.inc3 ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: addq %rbx, %r12 @@ -159,11 +159,11 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB1_12 Depth 2 ; CHECK-NEXT: # Child Loop BB1_14 Depth 2 -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: jns .LBB1_16 ; CHECK-NEXT: # %bb.3: # %for.body2.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: movslq %edx, %r13 +; CHECK-NEXT: movslq %eax, %r13 ; CHECK-NEXT: testq %r13, %r13 ; CHECK-NEXT: movq $-1, %rbp ; CHECK-NEXT: cmovnsq %r13, %rbp @@ -195,8 +195,8 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: ja .LBB1_14 ; CHECK-NEXT: .LBB1_7: # %vector.body.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: leaq -4(%rdx), %rax -; CHECK-NEXT: btl $2, 
%eax +; CHECK-NEXT: leaq -4(%rdx), %r10 +; CHECK-NEXT: btl $2, %r10d ; CHECK-NEXT: jb .LBB1_8 ; CHECK-NEXT: # %bb.9: # %vector.body.prol.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 @@ -204,38 +204,35 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8) ; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8) -; CHECK-NEXT: movl $4, %r10d -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: movl $4, %eax +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: jne .LBB1_11 ; CHECK-NEXT: jmp .LBB1_13 ; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: shrq $2, %r10 ; CHECK-NEXT: je .LBB1_13 ; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; CHECK-NEXT: movq %r10, %rax -; CHECK-NEXT: subq %rdx, %rax -; CHECK-NEXT: addq %r13, %r10 -; CHECK-NEXT: leaq (%r11,%r10,8), %r10 +; CHECK-NEXT: leaq (%r11,%r13,8), %r10 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_12: # %vector.body ; CHECK-NEXT: # Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movdqu %xmm0, -32(%r10) -; CHECK-NEXT: movdqu %xmm0, -16(%r10) -; CHECK-NEXT: movdqu %xmm0, (%r10) -; CHECK-NEXT: movdqu %xmm0, 16(%r10) -; CHECK-NEXT: addq $64, %r10 +; CHECK-NEXT: movdqu %xmm0, -32(%r10,%rax,8) +; CHECK-NEXT: movdqu %xmm0, -16(%r10,%rax,8) +; CHECK-NEXT: movdqu %xmm0, (%r10,%rax,8) +; CHECK-NEXT: movdqu %xmm0, 16(%r10,%rax,8) ; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: cmpq %rax, %rdx ; CHECK-NEXT: jne .LBB1_12 ; CHECK-NEXT: .LBB1_13: # %middle.block ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: addq %rdx, %r13 +; CHECK-NEXT: movq %r13, %rax ; CHECK-NEXT: cmpq %rdx, %rbp -; CHECK-NEXT: movq %r13, %rdx ; CHECK-NEXT: je .LBB1_15 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_14: # %for.body2 @@ -243,9 +240,9 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: movq (%r15), %rax ; CHECK-NEXT: movq %rax, (%r12,%r13,8) -; CHECK-NEXT: leaq 1(%r13), %rdx +; CHECK-NEXT: leaq 1(%r13), %rax ; CHECK-NEXT: cmpq $-1, %r13 -; CHECK-NEXT: movq %rdx, %r13 +; CHECK-NEXT: movq %rax, %r13 ; CHECK-NEXT: jl .LBB1_14 ; CHECK-NEXT: jmp .LBB1_15 ; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge @@ -253,7 +250,6 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movq %rdi, (%rax) ; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax ; CHECK-NEXT: movl $0, (%rax) -; CHECK-NEXT: .LBB1_18: # %for.end5 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -266,6 +262,13 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_restore %rbx +; CHECK-NEXT: .cfi_restore %r12 +; CHECK-NEXT: .cfi_restore %r13 +; CHECK-NEXT: .cfi_restore %r14 +; CHECK-NEXT: .cfi_restore %r15 +; CHECK-NEXT: .cfi_restore %rbp +; CHECK-NEXT: .LBB1_18: # %for.end5 ; CHECK-NEXT: retq entry: %0 = load i32, ptr @x1, align 4 diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll index 3efd536adc4d1..39e67f2d16fe0 100644 --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -51,7 +51,6 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: pushl %edi ; 
X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax @@ -64,6 +63,7 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: setb %al ; X86-NEXT: movzbl %al, %ecx ; X86-NEXT: movl %ebx, %eax @@ -88,8 +88,8 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: setb %dl ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movzbl %dl, %ecx ; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movzbl %dl, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %eax diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll index 1380c02663ee0..f2dc181c465c7 100644 --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -44,8 +44,8 @@ define i32 @select_and3(i32 %x, i32 %y) { define <4 x i32> @select_and_v4(i32 %x, <4 x i32> %y) { ; CHECK-LABEL: select_and_v4: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpl $11, %edi ; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cmpl $11, %edi ; CHECK-NEXT: jl .LBB3_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: movaps %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll index 345b2b9309f9a..5f0460ac28341 100644 --- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll +++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll @@ -438,8 +438,8 @@ define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind { ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 8f82a5bc6554e..8877a28d2c0e1 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -96,29 +96,31 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, 4(%edx) -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebp -; X86-NEXT: mull %ebx -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; 
X86-NEXT: movl %ebx, (%edi) +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: subl %eax, %esi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edi, %edx +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -129,10 +131,24 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rsi, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: je .LBB3_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rsi +; X64-NEXT: jmp .LBB3_3 +; X64-NEXT: .LBB3_1: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %esi +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: .LBB3_3: ; X64-NEXT: movq %rax, (%rcx) -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: imulq %rax, %rsi +; X64-NEXT: subq %rsi, %rdi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %div = sdiv i64 %x, %y store i64 %div, ptr %divdst, align 4 @@ -172,26 +188,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ebx ; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl 32(%ebp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: xorl %edx, %edi -; X86-NEXT: subl %edx, %edi -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: subl %edx, %esi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl @@ -203,90 +219,91 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bsrl %esi, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %edi, %edi -; X86-NEXT: xorl $31, %edi -; X86-NEXT: 
orl $32, %edi +; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: orl $64, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: orl $64, %edi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edx -; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %esi -; X86-NEXT: xorl $31, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: bsrl %esi, %edi +; X86-NEXT: xorl $31, %edi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: orl $32, %edx -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnel %edi, %edx ; X86-NEXT: orl $64, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl %edx, %edi +; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edi, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: cmovnel %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl $127, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %ebx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -299,229 +316,227 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorb $127, %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 152(%esp,%eax), %esi -; X86-NEXT: movl 156(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 152(%esp,%edi), %edx +; X86-NEXT: movl 156(%esp,%edi), %ebx +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl 148(%esp,%edi), %eax +; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esp,%eax), %edx -; X86-NEXT: movl 148(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl 144(%esp,%edi), %edx ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $0, %edx ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jmp .LBB4_9 ; X86-NEXT: .LBB4_2: # 
%udiv-preheader -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 108(%esp,%eax), %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $12, %cl +; X86-NEXT: movzbl %cl, %esi +; X86-NEXT: movl 108(%esp,%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shrdl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %esi -; X86-NEXT: movl 100(%esp,%eax), %eax +; X86-NEXT: movl 104(%esp,%esi), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: shrdl %cl, %ebx, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 100(%esp,%esi), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shrdl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%esi), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl $1, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: shldl $1, %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %edx -; X86-NEXT: shldl $1, %ecx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: orl %esi, %ecx +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: subl %ecx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: adcl $-1, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %ebx, %esi -; X86-NEXT: orl %edx, 
%esi -; X86-NEXT: shldl $1, %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: shldl $1, %edi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: shldl $1, %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %edi, %eax +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: sbbl %ecx, %eax -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebp), %ecx -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %ebx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: xorl %esi, %ebx +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %esi, %edx +; X86-NEXT: subl %esi, %edx +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %ecx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebp), %edi ; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %esi ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 12(%ecx) ; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi @@ -529,32 +544,37 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { 
; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl 36(%ebp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 40(%ebp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebx -; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: imull %eax, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: addl %edx, %esi +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl 12(%ebp), %edx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %esi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 20(%ebp), %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl %eax, 4(%edi) +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -651,17 +671,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: movzbl %al, %ecx ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) +; X86-NEXT: movd %ecx, %xmm5 ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm6 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: idivb {{[0-9]+}}(%esp) ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) @@ -672,18 +689,22 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: idivb {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movsbl (%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm7 +; X86-NEXT: movd %esi, 
%xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm4 +; X86-NEXT: movd %edi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm2 +; X86-NEXT: movd %ebx, %xmm2 ; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm5 -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movzbl %dl, %ecx ; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] @@ -727,74 +748,79 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: movzbl %al, %r11d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %esi +; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movzbl %al, %ebx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movzbl %al, %ebp ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r14d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movzbl %al, %r15d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebp +; X64-NEXT: movl %eax, %edx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r14d +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r15d +; X64-NEXT: movl %eax, %esi +; X64-NEXT: movzbl %dl, %r13d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: movl %eax, %edx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r13d +; X64-NEXT: movl %eax, %r8d +; X64-NEXT: movzbl %cl, %edi +; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edx +; X64-NEXT: movd %r11d, %xmm2 +; X64-NEXT: movzbl %dl, %r11d +; X64-NEXT: movzbl %r8b, %edx +; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: idivb -{{[0-9]+}}(%rsp) -; X64-NEXT: movd %edi, %xmm3 -; X64-NEXT: movd %esi, %xmm4 -; X64-NEXT: movd 
%r8d, %xmm5 -; X64-NEXT: movd %r9d, %xmm6 +; X64-NEXT: movd %r9d, %xmm3 +; X64-NEXT: movd %r10d, %xmm4 +; X64-NEXT: movd %ebx, %xmm5 +; X64-NEXT: movd %ebp, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm7 +; X64-NEXT: movd %r14d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; X64-NEXT: movd %r11d, %xmm4 +; X64-NEXT: movd %r15d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: movd %r12d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: movd %r13d, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movd %r14d, %xmm4 +; X64-NEXT: movd %edi, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: movd %esi, %xmm6 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X64-NEXT: movd %r12d, %xmm5 +; X64-NEXT: movd %r11d, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: movd %edx, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: movd %r8d, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 @@ -837,114 +863,116 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind { ; X86-LABEL: vector_i128_i16: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: pextrw $7, %xmm0, %eax -; X86-NEXT: pextrw $7, %xmm1, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $7, %xmm1, %edi ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: pextrw $6, %xmm0, %ecx +; X86-NEXT: idivw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: pextrw $6, %xmm1, %edi ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pextrw $6, %xmm0, %eax -; X86-NEXT: pextrw $6, %xmm1, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: idivw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: pextrw $5, %xmm0, %ecx ; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-NEXT: pextrw $5, %xmm0, %eax -; X86-NEXT: pextrw $5, %xmm1, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: pextrw $5, %xmm1, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: 
pextrw $4, %xmm0, %ecx +; X86-NEXT: idivw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: pextrw $4, %xmm1, %edi ; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: pextrw $4, %xmm0, %eax -; X86-NEXT: pextrw $4, %xmm1, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: idivw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: pextrw $3, %xmm0, %eax -; X86-NEXT: pextrw $3, %xmm1, %esi +; X86-NEXT: pextrw $3, %xmm1, %ecx +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: idivw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: pextrw $2, %xmm0, %eax -; X86-NEXT: pextrw $2, %xmm1, %esi +; X86-NEXT: pextrw $2, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: idivw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: pextrw $1, %xmm0, %eax -; X86-NEXT: pextrw $1, %xmm1, %esi +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-NEXT: pextrw $1, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: idivw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %esi +; X86-NEXT: movd %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: idivw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm5 ; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; X86-NEXT: movdqa %xmm5, (%ecx) +; X86-NEXT: movdqa %xmm5, (%esi) ; X86-NEXT: pmullw %xmm1, %xmm5 ; X86-NEXT: psubw %xmm5, %xmm0 ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: vector_i128_i16: ; X64: # %bb.0: ; X64-NEXT: pextrw $7, %xmm0, %eax -; X64-NEXT: pextrw $7, %xmm1, %ecx +; X64-NEXT: pextrw $7, %xmm1, %esi ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: cwtd -; X64-NEXT: idivw %cx +; X64-NEXT: pextrw $6, %xmm0, %ecx +; X64-NEXT: idivw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: pextrw $6, %xmm1, %esi ; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pextrw $6, %xmm0, %eax -; X64-NEXT: pextrw $6, %xmm1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cwtd -; X64-NEXT: idivw %cx +; X64-NEXT: idivw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: pextrw $5, %xmm0, %ecx ; X64-NEXT: movd %eax, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 
-; X64-NEXT: pextrw $5, %xmm0, %eax -; X64-NEXT: pextrw $5, %xmm1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: pextrw $5, %xmm1, %esi +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cwtd -; X64-NEXT: idivw %cx +; X64-NEXT: pextrw $4, %xmm0, %ecx +; X64-NEXT: idivw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: pextrw $4, %xmm1, %esi ; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: pextrw $4, %xmm0, %eax -; X64-NEXT: pextrw $4, %xmm1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: cwtd -; X64-NEXT: idivw %cx +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: idivw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: pextrw $3, %xmm0, %eax ; X64-NEXT: pextrw $3, %xmm1, %ecx +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: cwtd +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: idivw %cx ; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: movd %eax, %xmm3 @@ -955,8 +983,8 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw ; X64-NEXT: idivw %cx ; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; X64-NEXT: pextrw $1, %xmm0, %eax +; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; X64-NEXT: pextrw $1, %xmm1, %ecx ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: cwtd @@ -1083,50 +1111,41 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp -; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm1, (%esp) +; X86-NEXT: movaps %xmm0, %xmm2 +; X86-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; X86-NEXT: movups %xmm2, (%esp) +; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __divdi3 -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, (%esp) +; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: movups %xmm0, (%esp) ; X86-NEXT: movd 
%edx, %xmm0 ; X86-NEXT: movd %eax, %xmm1 ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __divdi3 ; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; X86-NEXT: movdqa %xmm4, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-NEXT: movdqa %xmm3, %xmm1 ; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: movdqa %xmm3, %xmm2 +; X86-NEXT: pmuludq %xmm4, %xmm1 +; X86-NEXT: movdqa %xmm4, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm3, %xmm2 ; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 +; X86-NEXT: pmuludq %xmm3, %xmm4 +; X86-NEXT: paddq %xmm2, %xmm4 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm3, %xmm0 +; X86-NEXT: psubq %xmm4, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 0bef9ee50bd54..bbfb97effbf4c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -96,29 +96,31 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %ebp ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, 4(%edx) -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebp -; X86-NEXT: mull %ebx -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ebx, (%edi) +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: subl %eax, %esi -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edi, %edx +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -129,10 +131,24 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax +; 
X64-NEXT: orq %rsi, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: je .LBB3_1 +; X64-NEXT: # %bb.2: +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rsi +; X64-NEXT: jmp .LBB3_3 +; X64-NEXT: .LBB3_1: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %esi +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: .LBB3_3: ; X64-NEXT: movq %rax, (%rcx) -; X64-NEXT: movq %rdx, %rax +; X64-NEXT: imulq %rax, %rsi +; X64-NEXT: subq %rsi, %rdi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %div = udiv i64 %x, %y store i64 %div, ptr %divdst, align 4 @@ -153,10 +169,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $160, %esp ; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: movl 32(%ebp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: orl 36(%ebp), %ecx ; X86-NEXT: orl %eax, %ecx @@ -169,23 +185,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl 36(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %eax ; X86-NEXT: xorl $31, %eax ; X86-NEXT: orl $32, %eax -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: orl $64, %eax ; X86-NEXT: movl 36(%ebp), %edx -; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: bsrl %ebx, %edx @@ -205,15 +221,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: orl $64, %edx -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl 20(%ebp), %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx @@ -221,49 +236,46 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: xorl $127, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: testb %cl, %cl -; X86-NEXT: movb %cl, %ah -; X86-NEXT: movl 
24(%ebp), %ebx -; X86-NEXT: movl $0, %esi -; X86-NEXT: cmovnel %esi, %ebx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovnel %esi, %ecx ; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebp), %esi -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl 12(%ebp), %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: orb %ah, %al -; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: cmovnel %edx, %ebx +; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: cmovnel %esi, %edx +; X86-NEXT: orb %cl, %al +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: jne .LBB4_7 ; X86-NEXT: # %bb.1: # %udiv-bb1 -; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorb $127, %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al @@ -271,111 +283,110 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax ; X86-NEXT: movl 136(%esp,%eax), %edi -; X86-NEXT: movl 140(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esp,%eax), %ebx -; X86-NEXT: movl 132(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: addl $1, %edx +; X86-NEXT: movl 140(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %edi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl 132(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: movl 128(%esp,%eax), %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 20(%ebp), %ebx ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.5: -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edi, %esi +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: jmp .LBB4_6 ; X86-NEXT: .LBB4_2: # %udiv-preheader ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl 16(%ebp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 92(%esp,%eax), %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %ebx +; X86-NEXT: movl 88(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esp,%eax), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %edi -; X86-NEXT: movl 84(%esp,%eax), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl %cl, %edx, %esi +; X86-NEXT: shrdl %cl, %ebx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 84(%esp,%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl %cl, %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 80(%esp,%eax), %eax +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shrdl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebp), %esi -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: shldl $1, %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx -; X86-NEXT: orl %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -400,115 +411,125 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl 28(%ebp), %ecx ; X86-NEXT: subl %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx ; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: .LBB4_6: # %udiv-loop-exit ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: shldl $1, %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: .LBB4_7: # %udiv-end -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %edx, %esi -; X86-NEXT: mull %ecx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: mull %edx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: imull %ecx, %edi -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: imull %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl %ebx, 12(%ecx) ; X86-NEXT: imull 28(%ebp), %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: imull %edx, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; 
X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl 16(%ebp), %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: setb %cl +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl 44(%ebp), %ebx +; X86-NEXT: movl %esi, (%ebx) +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl 12(%ebp), %ebx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -605,17 +626,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd 
%eax, %xmm5 +; X86-NEXT: movzbl %al, %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) +; X86-NEXT: movd %ecx, %xmm5 ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: divb {{[0-9]+}}(%esp) ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) @@ -626,18 +644,22 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: divb {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl (%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm7 +; X86-NEXT: movd %esi, %xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm4 +; X86-NEXT: movd %edi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm2 +; X86-NEXT: movd %ebx, %xmm2 ; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm5 -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movzbl %dl, %ecx ; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] @@ -681,74 +703,79 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: movzbl %al, %r11d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %esi +; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movzbl %al, %ebx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movzbl %al, %ebp ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r14d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movzbl %al, %r15d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %ebp +; X64-NEXT: movl %eax, %edx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: 
movzbl %al, %r14d +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r15d +; X64-NEXT: movl %eax, %esi +; X64-NEXT: movzbl %dl, %r13d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: movl %eax, %edx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %r13d +; X64-NEXT: movl %eax, %r8d +; X64-NEXT: movzbl %cl, %edi +; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl %al, %edx +; X64-NEXT: movd %r11d, %xmm2 +; X64-NEXT: movzbl %dl, %r11d +; X64-NEXT: movzbl %r8b, %edx +; X64-NEXT: movzbl %al, %r8d ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; X64-NEXT: divb -{{[0-9]+}}(%rsp) -; X64-NEXT: movd %edi, %xmm3 -; X64-NEXT: movd %esi, %xmm4 -; X64-NEXT: movd %r8d, %xmm5 -; X64-NEXT: movd %r9d, %xmm6 +; X64-NEXT: movd %r9d, %xmm3 +; X64-NEXT: movd %r10d, %xmm4 +; X64-NEXT: movd %ebx, %xmm5 +; X64-NEXT: movd %ebp, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm7 +; X64-NEXT: movd %r14d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; X64-NEXT: movd %r11d, %xmm4 +; X64-NEXT: movd %r15d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: movd %r12d, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: movd %r13d, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movd %r14d, %xmm4 +; X64-NEXT: movd %edi, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: movd %esi, %xmm6 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X64-NEXT: movd %r12d, %xmm5 +; X64-NEXT: movd %r11d, %xmm5 ; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: movd %edx, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X64-NEXT: movd %edx, %xmm6 +; X64-NEXT: movd %r8d, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 @@ -791,75 +818,77 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind { ; X86-LABEL: vector_i128_i16: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: pextrw $7, %xmm0, %eax -; 
X86-NEXT: pextrw $7, %xmm1, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: pextrw $7, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm2 ; X86-NEXT: pextrw $6, %xmm0, %eax -; X86-NEXT: pextrw $6, %xmm1, %esi +; X86-NEXT: pextrw $6, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm3 ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X86-NEXT: pextrw $5, %xmm0, %eax -; X86-NEXT: pextrw $5, %xmm1, %esi +; X86-NEXT: pextrw $5, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: pextrw $4, %xmm0, %ecx ; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: pextrw $4, %xmm0, %eax -; X86-NEXT: pextrw $4, %xmm1, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: pextrw $4, %xmm1, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm2 ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: pextrw $3, %xmm0, %eax -; X86-NEXT: pextrw $3, %xmm1, %esi +; X86-NEXT: pextrw $3, %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax +; X86-NEXT: pextrw $2, %xmm0, %ecx ; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: pextrw $2, %xmm0, %eax -; X86-NEXT: pextrw $2, %xmm1, %esi -; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: pextrw $2, %xmm1, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: pextrw $1, %xmm0, %eax -; X86-NEXT: pextrw $1, %xmm1, %esi +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: movd %xmm1, %esi +; X86-NEXT: movd %xmm1, %ecx ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divw %si +; X86-NEXT: divw %cx ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm5 ; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; X86-NEXT: movdqa %xmm5, (%ecx) +; X86-NEXT: movdqa %xmm5, (%esi) ; X86-NEXT: pmullw %xmm1, %xmm5 ; X86-NEXT: psubw %xmm5, %xmm0 ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: vector_i128_i16: @@ -885,12 +914,12 @@ define <8 x i16> 
@vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divw %cx ; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: pextrw $4, %xmm0, %ecx ; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: pextrw $4, %xmm0, %eax -; X64-NEXT: pextrw $4, %xmm1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: pextrw $4, %xmm1, %esi +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divw %cx +; X64-NEXT: divw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] @@ -901,17 +930,17 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divw %cx ; X64-NEXT: # kill: def $ax killed $ax def $eax +; X64-NEXT: pextrw $2, %xmm0, %ecx ; X64-NEXT: movd %eax, %xmm3 -; X64-NEXT: pextrw $2, %xmm0, %eax -; X64-NEXT: pextrw $2, %xmm1, %ecx -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: pextrw $2, %xmm1, %esi +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divw %cx +; X64-NEXT: divw %si ; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; X64-NEXT: pextrw $1, %xmm0, %eax ; X64-NEXT: pextrw $1, %xmm1, %ecx +; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divw %cx @@ -1037,50 +1066,41 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp -; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm1, (%esp) +; X86-NEXT: movaps %xmm0, %xmm2 +; X86-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; X86-NEXT: movups %xmm2, (%esp) +; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, (%esp) +; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: movups %xmm0, (%esp) ; X86-NEXT: movd %edx, %xmm0 ; X86-NEXT: movd %eax, %xmm1 ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 ; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: 
punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; X86-NEXT: movdqa %xmm4, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-NEXT: movdqa %xmm3, %xmm1 ; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: movdqa %xmm3, %xmm2 +; X86-NEXT: pmuludq %xmm4, %xmm1 +; X86-NEXT: movdqa %xmm4, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm3, %xmm2 ; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 +; X86-NEXT: pmuludq %xmm3, %xmm4 +; X86-NEXT: paddq %xmm2, %xmm4 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm3, %xmm0 +; X86-NEXT: psubq %xmm4, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll index ac78136b9d8ea..c270d7be04daa 100644 --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -309,31 +309,17 @@ define i64 @PR23590(i64 %x) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: retl ; -; X64-FAST-LABEL: PR23590: -; X64-FAST: # %bb.0: # %entry -; X64-FAST-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F -; X64-FAST-NEXT: movq %rdi, %rax -; X64-FAST-NEXT: mulq %rcx -; X64-FAST-NEXT: shrq $12, %rdx -; X64-FAST-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 -; X64-FAST-NEXT: subq %rax, %rdi -; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 -; X64-FAST-NEXT: movq %rdi, %rax -; X64-FAST-NEXT: mulq %rcx -; X64-FAST-NEXT: movq %rdx, %rax -; X64-FAST-NEXT: retq -; -; X64-SLOW-LABEL: PR23590: -; X64-SLOW: # %bb.0: # %entry -; X64-SLOW-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F -; X64-SLOW-NEXT: movq %rdi, %rax -; X64-SLOW-NEXT: mulq %rcx -; X64-SLOW-NEXT: shrq $12, %rdx -; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 -; X64-SLOW-NEXT: subq %rax, %rdi -; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925 -; X64-SLOW-NEXT: shrq $32, %rax -; X64-SLOW-NEXT: retq +; X64-LABEL: PR23590: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $12, %rdx +; X64-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925 +; X64-NEXT: shrq $32, %rax +; X64-NEXT: retq entry: %rem = urem i64 %x, 12345 %div = udiv i64 %rem, 7 @@ -1189,3 +1175,6 @@ entry: ret i64 %rem } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; X64-FAST: {{.*}} +; X64-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll index 3796dd796eaf9..491982e16f50f 100644 --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -478,9 +478,9 @@ define i128 @udiv_i128_3(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -501,9 +501,9 @@ define i128 @udiv_i128_3(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -528,9 +528,9 @@ define i128 @udiv_i128_5(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -551,9 +551,9 @@ define i128 @udiv_i128_5(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-3689348814741910324, %r9 # imm = 0xCCCCCCCCCCCCCCCC -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -579,10 +579,10 @@ define i128 @udiv_i128_15(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-1229782938247303442, %rcx # imm = 0xEEEEEEEEEEEEEEEE -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -604,10 +604,10 @@ define i128 @udiv_i128_15(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-1229782938247303442, %r9 # imm = 0xEEEEEEEEEEEEEEEE -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -634,9 +634,9 @@ define i128 @udiv_i128_17(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0 -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -659,9 +659,9 @@ define i128 @udiv_i128_17(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0 -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: 
movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -689,10 +689,10 @@ define i128 @udiv_i128_255(i128 %x) nounwind { ; X86-64-NEXT: subq %rax, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-72340172838076674, %rcx # imm = 0xFEFEFEFEFEFEFEFE -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -716,10 +716,10 @@ define i128 @udiv_i128_255(i128 %x) nounwind { ; WIN64-NEXT: subq %rax, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-72340172838076674, %r9 # imm = 0xFEFEFEFEFEFEFEFE -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -746,9 +746,9 @@ define i128 @udiv_i128_257(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00 -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -771,9 +771,9 @@ define i128 @udiv_i128_257(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00 -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -801,10 +801,10 @@ define i128 @udiv_i128_65535(i128 %x) nounwind { ; X86-64-NEXT: subq %rax, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-281479271743490, %rcx # imm = 0xFFFEFFFEFFFEFFFE -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -828,10 +828,10 @@ define i128 @udiv_i128_65535(i128 %x) nounwind { ; WIN64-NEXT: subq %rax, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-281479271743490, %r9 # imm = 0xFFFEFFFEFFFEFFFE -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -858,9 +858,9 @@ define i128 @udiv_i128_65537(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000 -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -883,9 +883,9 @@ define i128 @udiv_i128_65537(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000 -; WIN64-NEXT: imulq 
%rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx @@ -912,9 +912,9 @@ define i128 @udiv_i128_12(i128 %x) nounwind { ; X86-64-NEXT: subq %rcx, %rdi ; X86-64-NEXT: sbbq $0, %rsi ; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA -; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 ; X86-64-NEXT: addq %r8, %rdx @@ -937,9 +937,9 @@ define i128 @udiv_i128_12(i128 %x) nounwind { ; WIN64-NEXT: subq %r9, %rcx ; WIN64-NEXT: sbbq $0, %r8 ; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA -; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 ; WIN64-NEXT: addq %r8, %rdx diff --git a/llvm/test/CodeGen/X86/divrem.ll b/llvm/test/CodeGen/X86/divrem.ll index ba777b4954611..65697988bbf2b 100644 --- a/llvm/test/CodeGen/X86/divrem.ll +++ b/llvm/test/CodeGen/X86/divrem.ll @@ -41,8 +41,21 @@ define void @si64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %rdx +; X64-NEXT: orq %rsi, %rdx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # %bb.2: ; X64-NEXT: cqto ; X64-NEXT: idivq %rsi +; X64-NEXT: jmp .LBB0_3 +; X64-NEXT: .LBB0_1: +; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %esi +; X64-NEXT: # kill: def $edx killed $edx def $rdx +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: .LBB0_3: ; X64-NEXT: movq %rax, (%r8) ; X64-NEXT: movq %rdx, (%rcx) ; X64-NEXT: retq @@ -182,8 +195,21 @@ define void @ui64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %rdx +; X64-NEXT: orq %rsi, %rdx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: je .LBB4_1 +; X64-NEXT: # %bb.2: ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rsi +; X64-NEXT: jmp .LBB4_3 +; X64-NEXT: .LBB4_1: +; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %esi +; X64-NEXT: # kill: def $edx killed $edx def $rdx +; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: .LBB4_3: ; X64-NEXT: movq %rax, (%r8) ; X64-NEXT: movq %rdx, (%rcx) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll index a212f99680ef4..ee679026af33e 100644 --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -53,8 +53,8 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; CHECK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem) ; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -107,7 +107,7 @@ entry: define i32 @mul_zext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/early-ifcvt.ll b/llvm/test/CodeGen/X86/early-ifcvt.ll index d50f7e9e392a8..2dad25be25fc2 100644 --- a/llvm/test/CodeGen/X86/early-ifcvt.ll +++ b/llvm/test/CodeGen/X86/early-ifcvt.ll @@ -1,14 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt | FileCheck %s target triple = "x86_64-apple-macosx10.8.0" -; CHECK: mm2 define i32 @mm2(ptr nocapture %p, i32 %n) nounwind uwtable readonly ssp { +; CHECK-LABEL: mm2: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: LBB0_1: ## %do.body +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movl %esi, %r8d +; CHECK-NEXT: movl (%rdi,%rdx,4), %r9d +; CHECK-NEXT: cmpl %esi, %r9d +; CHECK-NEXT: cmovll %r9d, %esi +; CHECK-NEXT: cmpl %eax, %r9d +; CHECK-NEXT: cmovgl %r9d, %eax +; CHECK-NEXT: cmovgl %r8d, %esi +; CHECK-NEXT: incq %rdx +; CHECK-NEXT: cmpl %edx, %ecx +; CHECK-NEXT: jne LBB0_1 +; CHECK-NEXT: ## %bb.2: ## %do.end +; CHECK-NEXT: subl %esi, %eax +; CHECK-NEXT: retq entry: br label %do.body -; CHECK: do.body ; Loop body has no branches before the backedge. -; CHECK-NOT: LBB do.body: %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ] %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ] @@ -27,8 +47,6 @@ if.else: do.cond: %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ] %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ] -; CHECK: decl %esi -; CHECK: jne LBB %dec = add i32 %n.addr.0, -1 %tobool = icmp eq i32 %dec, 0 br i1 %tobool, label %do.end, label %do.body @@ -38,16 +56,23 @@ do.end: ret i32 %sub } -; CHECK: multipreds ; Deal with alternative tail predecessors -; CHECK-NOT: LBB -; CHECK: cmov -; CHECK-NOT: LBB -; CHECK: cmov -; CHECK-NOT: LBB -; CHECK: fprintf - define void @multipreds(i32 %sw) nounwind uwtable ssp { +; CHECK-LABEL: multipreds: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movl $66, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: cmpl $127, %edi +; CHECK-NEXT: movl $2, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovel %edx, %eax +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: callq _fprintf +; CHECK-NEXT: ud2 entry: switch i32 %sw, label %if.then29 [ i32 0, label %if.then37 @@ -68,10 +93,51 @@ if.end41: declare void @fprintf(...) nounwind -; CHECK: BZ2_decompress ; This test case contains irreducible control flow, so MachineLoopInfo doesn't ; recognize the cycle in the CFG. This would confuse MachineTraceMetrics. 
define void @BZ2_decompress(ptr %s) nounwind ssp { +; CHECK-LABEL: BZ2_decompress: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: jne LBB2_8 +; CHECK-NEXT: ## %bb.1: ## %entry +; CHECK-NEXT: movabsq $18897856102400, %rax ## imm = 0x113000000000 +; CHECK-NEXT: btq %rbx, %rax +; CHECK-NEXT: jae LBB2_2 +; CHECK-NEXT: LBB2_9: ## %save_state_and_return +; CHECK-NEXT: movl %ebx, (%rax) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +; CHECK-NEXT: LBB2_2: ## %entry +; CHECK-NEXT: cmpq $38, %rbx +; CHECK-NEXT: je LBB2_5 +; CHECK-NEXT: ## %bb.3: ## %entry +; CHECK-NEXT: cmpq $39, %rbx +; CHECK-NEXT: jne LBB2_8 +; CHECK-NEXT: ## %bb.4: ## %if.end.sw.bb2050_crit_edge +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: jmp LBB2_6 +; CHECK-NEXT: LBB2_8: ## %sw.default +; CHECK-NEXT: callq _BZ2_bz__AssertH__fail +; CHECK-NEXT: jmp LBB2_9 +; CHECK-NEXT: LBB2_5: ## %sw.bb1983 +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: jne LBB2_9 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: LBB2_6: ## %while.body2038 +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne LBB2_9 +; CHECK-NEXT: ## %bb.7: ## %if.end2042 +; CHECK-NEXT: ## in Loop: Header=BB2_6 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je LBB2_6 +; CHECK-NEXT: jmp LBB2_9 entry: switch i32 undef, label %sw.default [ i32 39, label %if.end.sw.bb2050_crit_edge @@ -147,6 +213,16 @@ declare void @BZ2_bz__AssertH__fail() ; CHECK: test_idiv ; CHECK-NOT: cmov define i32 @test_idiv(i32 %a, i32 %b) nounwind uwtable readnone ssp { +; CHECK-LABEL: test_idiv: +; CHECK: ## %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je LBB3_2 +; CHECK-NEXT: ## %bb.1: +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: LBB3_2: +; CHECK-NEXT: retq %1 = icmp eq i32 %b, 0 br i1 %1, label %4, label %2 @@ -162,6 +238,16 @@ define i32 @test_idiv(i32 %a, i32 %b) nounwind uwtable readnone ssp { ; CHECK: test_div ; CHECK-NOT: cmov define i32 @test_div(i32 %a, i32 %b) nounwind uwtable readnone ssp { +; CHECK-LABEL: test_div: +; CHECK: ## %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je LBB4_2 +; CHECK-NEXT: ## %bb.1: +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: divl %esi +; CHECK-NEXT: LBB4_2: +; CHECK-NEXT: retq %1 = icmp eq i32 %b, 0 br i1 %1, label %4, label %2 diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 3243d950740ca..826b33231ec0c 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -26,15 +26,15 @@ define <4 x i32> @eq_or_eq_ult_2(<4 x i32> %x) { ; AVX2-LABEL: eq_or_eq_ult_2: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5] +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_eq_ult_2: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [6,6,6,6] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -73,9 +73,9 @@ define <4 x i32> @eq_or_eq_ult_2_only_transform_sse2(<4 x i32> %x) { ; AVX2-LABEL: 
eq_or_eq_ult_2_only_transform_sse2: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -83,7 +83,7 @@ define <4 x i32> @eq_or_eq_ult_2_only_transform_sse2(<4 x i32> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -156,7 +156,7 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE41-NEXT: callq use.v4.i32@PLT -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE41-NEXT: pminud %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 @@ -207,9 +207,9 @@ define <4 x i32> @eq_or_eq_ult_3_fail(<4 x i32> %x) { ; AVX2-LABEL: eq_or_eq_ult_3_fail: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -217,7 +217,7 @@ define <4 x i32> @eq_or_eq_ult_3_fail(<4 x i32> %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [2,2,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -254,7 +254,7 @@ define <4 x i32> @eq_or_eq_ugt_m3(<4 x i32> %x) { ; ; SSE41-LABEL: eq_or_eq_ugt_m3: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [9,12,9,9] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9,12,9,9] ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -291,7 +291,7 @@ define <4 x i32> @eq_or_eq_ule_1(<4 x i32> %x) { ; ; SSE41-LABEL: eq_or_eq_ule_1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967294,4294967293] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,4294967294,4294967293] ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -328,7 +328,7 @@ define <2 x i64> @eq_or_eq_uge_m2_i64(<2 x i64> %x) { ; ; SSE41-LABEL: eq_or_eq_uge_m2_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551613,18446744073709551612] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551613,18446744073709551612] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -424,7 +424,7 @@ define <4 x i32> @eq_or_eq_uge_2_fail_(<4 x i32> %x) { ; SSE41-LABEL: eq_or_eq_uge_2_fail_: ; SSE41: # %bb.0: ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [2,2,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -471,18 +471,18 @@ define <8 x i32> 
@eq_or_eq_ult_2_256(<8 x i32> %x) { ; AVX2-LABEL: eq_or_eq_ult_2_256: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_eq_ult_2_256: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [6,6,6,6] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [6,6,6,6] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [5,5,5,5] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [5,5,5,5] ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 @@ -535,18 +535,18 @@ define <8 x i32> @eq_or_eq_ult_2_256_m1(<8 x i32> %x) { ; AVX2-LABEL: eq_or_eq_ult_2_256_m1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_eq_ult_2_256_m1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [1,1,1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll index dbfa69d497698..a737c1b1cd8f8 100644 --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -89,11 +89,9 @@ declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_sdiv_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -102,10 +100,10 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: cltd ; X86-NEXT: idivl %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: vmovd %xmm1, %edi +; X86-NEXT: vmovd %xmm1, %esi ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi +; X86-NEXT: idivl %esi ; X86-NEXT: vmovd %eax, %xmm2 ; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $2, %xmm1, %ecx @@ -115,12 +113,12 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X86-NEXT: vpextrd $3, %xmm1, %ecx ; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %ecx ; X86-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%esi) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; SSE-LABEL: vp_sdiv_v4i32: @@ -169,7 +167,7 @@ define void @vp_sdiv_v4i32(<4 x i32> 
%a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -201,7 +199,7 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -268,11 +266,9 @@ declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_udiv_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -281,10 +277,10 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: vmovd %xmm1, %edi +; X86-NEXT: vmovd %xmm1, %esi ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %edi +; X86-NEXT: divl %esi ; X86-NEXT: vmovd %eax, %xmm2 ; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $2, %xmm1, %ecx @@ -294,12 +290,12 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X86-NEXT: vpextrd $3, %xmm1, %ecx ; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx ; X86-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%esi) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; SSE-LABEL: vp_udiv_v4i32: @@ -348,7 +344,7 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -380,7 +376,7 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -447,11 +443,9 @@ declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_srem_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi 
; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -460,10 +454,10 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: cltd ; X86-NEXT: idivl %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: vmovd %xmm1, %edi +; X86-NEXT: vmovd %xmm1, %esi ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %edi +; X86-NEXT: idivl %esi ; X86-NEXT: vmovd %edx, %xmm2 ; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $2, %xmm1, %ecx @@ -473,12 +467,12 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $3, %xmm1, %ecx ; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %ecx ; X86-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%esi) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; SSE-LABEL: vp_srem_v4i32: @@ -527,7 +521,7 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -559,7 +553,7 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -626,11 +620,9 @@ declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_urem_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm2 -; X86-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; X86-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; X86-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 @@ -639,10 +631,10 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: vmovd %xmm1, %edi +; X86-NEXT: vmovd %xmm1, %esi ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %edi +; X86-NEXT: divl %esi ; X86-NEXT: vmovd %edx, %xmm2 ; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $2, %xmm1, %ecx @@ -652,12 +644,12 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; X86-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ; X86-NEXT: vpextrd $3, %xmm1, %ecx ; X86-NEXT: vpextrd $3, %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx ; X86-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%esi) ; X86-NEXT: popl %esi -; 
X86-NEXT: popl %edi ; X86-NEXT: retl ; ; SSE-LABEL: vp_urem_v4i32: @@ -706,7 +698,7 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %esi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -738,7 +730,7 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %esi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] @@ -805,19 +797,19 @@ declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_ashr_v4i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpsrad %xmm2, %xmm0, %xmm2 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 ; X86-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X86-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X86-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-NEXT: vpsrad %xmm4, %xmm0, %xmm4 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl ; @@ -829,14 +821,14 @@ define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: psrad %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrad %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrad %xmm2, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE-NEXT: psrad %xmm1, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] ; SSE-NEXT: movaps %xmm4, (%rdi) ; SSE-NEXT: retq @@ -847,14 +839,14 @@ define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, 
%xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; @@ -878,19 +870,19 @@ declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_lshr_v4i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 ; X86-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X86-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X86-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl ; @@ -902,14 +894,14 @@ define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld %xmm2, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE-NEXT: psrld %xmm1, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] ; SSE-NEXT: movaps %xmm4, (%rdi) ; SSE-NEXT: retq @@ -920,14 +912,14 @@ define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: 
vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; @@ -951,9 +943,9 @@ declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define void @vp_shl_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind { ; X86-LABEL: vp_shl_v4i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpslld $23, %xmm1, %xmm1 ; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vcvttps2dq %xmm1, %xmm1 ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) @@ -1291,7 +1283,7 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e ; AVX2-LABEL: vp_bitreverse_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1305,7 +1297,7 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e ; AVX512-LABEL: vp_bitreverse_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1412,7 +1404,7 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; ; AVX2-LABEL: vp_ctpop_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1430,7 +1422,7 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; ; AVX512-LABEL: vp_ctpop_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1665,7 +1657,7 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: 
vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1686,7 +1678,7 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1712,10 +1704,10 @@ define <4 x i32> @vp_sadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i ; X86: # %bb.0: ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; X86-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm2, %xmm3 +; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpsrad $31, %xmm2, %xmm1 -; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; X86-NEXT: retl ; ; SSE-LABEL: vp_sadd_sat_v4i32: @@ -1737,10 +1729,10 @@ define <4 x i32> @vp_sadd_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vp_sadd_sat_v4i32: @@ -1825,10 +1817,10 @@ define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i ; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; X86-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; X86-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm1, %xmm3 +; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; X86-NEXT: vpsrad $31, %xmm1, %xmm2 -; X86-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 -; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; X86-NEXT: retl ; ; SSE-LABEL: vp_ssub_sat_v4i32: @@ -1853,10 +1845,10 @@ define <4 x i32> @vp_ssub_sat_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vp_ssub_sat_v4i32: @@ -1964,14 +1956,14 @@ define <4 x i32> @vp_fshl_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; 
SSE-NEXT: psrld %xmm7, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: psrld %xmm7, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE-NEXT: psrld %xmm5, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pslld $23, %xmm2 @@ -2054,14 +2046,14 @@ define <4 x i32> @vp_fshr_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 ; X86-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; X86-NEXT: vpsrlq $32, %xmm4, %xmm6 ; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 -; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; X86-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; X86-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; X86-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; X86-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; X86-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; X86-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; X86-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; X86-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; X86-NEXT: vpslld $23, %xmm2, %xmm2 ; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 @@ -2093,10 +2085,10 @@ define <4 x i32> @vp_fshr_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE-NEXT: psrld %xmm5, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: pslld $23, %xmm2 ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -2117,14 +2109,14 @@ define <4 x i32> @vp_fshr_v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc, <4 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll index 90e075bfabf0a..28fe3be69b5c4 100644 --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -544,14 +544,15 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: movl %edx, %edi +; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi -; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: shrdl %cl, %edx, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB7_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -559,10 +560,9 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: .LBB7_2: ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx ; X86-NOBMI-NEXT: shll %cl, %eax -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB7_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %edx @@ -580,14 +580,15 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl %eax, %edi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: movl %edx, %edi +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edi -; X86-BMI1-NEXT: shrdl %cl, %eax, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: shrdl %cl, %edx, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB7_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edi, %esi @@ -595,10 +596,9 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: .LBB7_2: ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB7_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %edx @@ -614,16 +614,16 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; ; X86-BMI2-LABEL: bextr64_a0: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-BMI2-NEXT: shrdl %cl, %eax, %esi -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl %eax, %ecx +; X86-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %edx, %edi +; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB7_2 ; X86-BMI2-NEXT: # %bb.1: ; X86-BMI2-NEXT: movl %edi, %esi @@ -631,10 +631,9 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI2-NEXT: .LBB7_2: ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB7_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %eax, %edx @@ -646,7 +645,6 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI2-NEXT: andl %edi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi -; X86-BMI2-NEXT: popl %ebx ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_a0: @@ -686,26 +684,26 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: movl %edx, %esi +; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: sarl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %eax, %edi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: shrdl %cl, %edx, %edi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB8_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: sarl $31, %eax +; X86-NOBMI-NEXT: sarl $31, %edx ; X86-NOBMI-NEXT: movl %esi, %edi -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: .LBB8_2: ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx ; X86-NOBMI-NEXT: shll %cl, %eax -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB8_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %edx @@ -723,26 +721,26 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl %eax, %esi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: movl %edx, %esi +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: sarl %cl, %esi -; X86-BMI1-NEXT: shrdl %cl, %eax, %edi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: shrdl %cl, %edx, %edi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB8_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: sarl $31, %eax +; X86-BMI1-NEXT: sarl $31, %edx ; X86-BMI1-NEXT: movl %esi, %edi -; X86-BMI1-NEXT: movl 
%eax, %esi +; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB8_2: ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB8_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %edx @@ -758,28 +756,27 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n ; ; X86-BMI2-LABEL: bextr64_a0_arithmetic: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: shrdl %cl, %eax, %esi -; X86-BMI2-NEXT: sarxl %ecx, %eax, %edi -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl %eax, %ecx +; X86-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: sarxl %eax, %edx, %edi +; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB8_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: sarl $31, %eax +; X86-BMI2-NEXT: sarl $31, %edx ; X86-BMI2-NEXT: movl %edi, %esi -; X86-BMI2-NEXT: movl %eax, %edi +; X86-BMI2-NEXT: movl %edx, %edi ; X86-BMI2-NEXT: .LBB8_2: ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB8_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %eax, %edx @@ -791,7 +788,6 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n ; X86-BMI2-NEXT: andl %edi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi -; X86-BMI2-NEXT: popl %ebx ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_a0_arithmetic: @@ -832,14 +828,15 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: movl %edx, %edi +; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi -; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: shrdl %cl, %edx, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB9_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -847,10 +844,9 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-NOBMI-NEXT: .LBB9_2: ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx ; X86-NOBMI-NEXT: shll %cl, %eax -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB9_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %edx @@ -868,14 +864,15 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 
zeroext %numskipbits, i8 zeroext % ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl %eax, %edi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: movl %edx, %edi +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edi -; X86-BMI1-NEXT: shrdl %cl, %eax, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: shrdl %cl, %edx, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB9_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edi, %esi @@ -883,10 +880,9 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-BMI1-NEXT: .LBB9_2: ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB9_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %edx @@ -902,16 +898,16 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; ; X86-BMI2-LABEL: bextr64_a1_indexzext: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: shrdl %cl, %eax, %esi -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl %eax, %ecx +; X86-BMI2-NEXT: shrdl %cl, %edx, %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %edx, %edi +; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB9_2 ; X86-BMI2-NEXT: # %bb.1: ; X86-BMI2-NEXT: movl %edi, %esi @@ -919,10 +915,9 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-BMI2-NEXT: .LBB9_2: ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB9_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %eax, %edx @@ -934,7 +929,6 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-BMI2-NEXT: andl %edi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi -; X86-BMI2-NEXT: popl %ebx ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_a1_indexzext: @@ -978,15 +972,16 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movl 4(%eax), %eax -; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl (%ecx), %esi +; X86-NOBMI-NEXT: movl 
4(%ecx), %edx +; X86-NOBMI-NEXT: movl %edx, %edi +; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi -; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: shrdl %cl, %edx, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB10_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -994,10 +989,9 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI-NEXT: .LBB10_2: ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx ; X86-NOBMI-NEXT: shll %cl, %eax -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB10_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %edx @@ -1015,15 +1009,16 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl (%eax), %esi -; X86-BMI1-NEXT: movl 4(%eax), %eax -; X86-BMI1-NEXT: movl %eax, %edi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl (%ecx), %esi +; X86-BMI1-NEXT: movl 4(%ecx), %edx +; X86-BMI1-NEXT: movl %edx, %edi +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edi -; X86-BMI1-NEXT: shrdl %cl, %eax, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: shrdl %cl, %edx, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB10_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edi, %esi @@ -1031,10 +1026,9 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1-NEXT: .LBB10_2: ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB10_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %edx @@ -1050,17 +1044,17 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; ; X86-BMI2-LABEL: bextr64_a2_load: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movl (%ecx), %esi +; X86-BMI2-NEXT: movl 4(%ecx), %edx +; X86-BMI2-NEXT: movl %eax, %ecx +; X86-BMI2-NEXT: shrdl %cl, %edx, %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movl (%eax), %esi -; X86-BMI2-NEXT: movl 4(%eax), %eax -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi -; X86-BMI2-NEXT: shrdl %cl, %eax, %esi -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: shrxl %eax, %edx, %edi +; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB10_2 ; X86-BMI2-NEXT: # %bb.1: ; X86-BMI2-NEXT: movl %edi, %esi @@ -1068,10 +1062,9 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-NEXT: .LBB10_2: ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx -; X86-BMI2-NEXT: 
shlxl %ebx, %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB10_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %eax, %edx @@ -1083,7 +1076,6 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-NEXT: andl %edi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi -; X86-BMI2-NEXT: popl %ebx ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_a2_load: @@ -1125,15 +1117,16 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movl 4(%eax), %eax -; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl (%ecx), %esi +; X86-NOBMI-NEXT: movl 4(%ecx), %edx +; X86-NOBMI-NEXT: movl %edx, %edi +; X86-NOBMI-NEXT: movl %eax, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi -; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: shrdl %cl, %edx, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB11_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -1141,10 +1134,9 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI-NEXT: .LBB11_2: ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx ; X86-NOBMI-NEXT: shll %cl, %eax -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB11_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %edx @@ -1162,15 +1154,16 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl (%eax), %esi -; X86-BMI1-NEXT: movl 4(%eax), %eax -; X86-BMI1-NEXT: movl %eax, %edi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl (%ecx), %esi +; X86-BMI1-NEXT: movl 4(%ecx), %edx +; X86-BMI1-NEXT: movl %edx, %edi +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edi -; X86-BMI1-NEXT: shrdl %cl, %eax, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: shrdl %cl, %edx, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB11_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edi, %esi @@ -1178,10 +1171,9 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-BMI1-NEXT: .LBB11_2: ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB11_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %edx @@ -1197,17 +1189,17 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; ; X86-BMI2-LABEL: bextr64_a3_load_indexzext: ; X86-BMI2: # %bb.0: 
-; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: movl (%ecx), %esi +; X86-BMI2-NEXT: movl 4(%ecx), %edx +; X86-BMI2-NEXT: movl %eax, %ecx +; X86-BMI2-NEXT: shrdl %cl, %edx, %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: movl (%eax), %esi -; X86-BMI2-NEXT: movl 4(%eax), %eax -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi -; X86-BMI2-NEXT: shrdl %cl, %eax, %esi -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: shrxl %eax, %edx, %edi +; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB11_2 ; X86-BMI2-NEXT: # %bb.1: ; X86-BMI2-NEXT: movl %edi, %esi @@ -1215,10 +1207,9 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-BMI2-NEXT: .LBB11_2: ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx -; X86-BMI2-NEXT: shlxl %ebx, %eax, %eax -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB11_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %eax, %edx @@ -1230,7 +1221,6 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-BMI2-NEXT: andl %edi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi -; X86-BMI2-NEXT: popl %ebx ; X86-BMI2-NEXT: retl ; ; X64-NOBMI-LABEL: bextr64_a3_load_indexzext: @@ -1277,13 +1267,14 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl %esi, %edx +; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shrl %cl, %edx ; X86-NOBMI-NEXT: shrdl %cl, %esi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: testb $32, %ch ; X86-NOBMI-NEXT: je .LBB12_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edx, %eax @@ -1291,10 +1282,9 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-NOBMI-NEXT: .LBB12_2: ; X86-NOBMI-NEXT: movl $1, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shldl %cl, %esi, %edi ; X86-NOBMI-NEXT: shll %cl, %esi -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB12_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %edi @@ -1313,13 +1303,14 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %esi, %eax -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: testb $32, %ch ; X86-BMI1-NEXT: je .LBB12_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %eax @@ -1327,10 +1318,9 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI1-NEXT: 
.LBB12_2: ; X86-BMI1-NEXT: movl $1, %esi ; X86-BMI1-NEXT: xorl %edi, %edi -; X86-BMI1-NEXT: movb %ch, %cl ; X86-BMI1-NEXT: shldl %cl, %esi, %edi ; X86-BMI1-NEXT: shll %cl, %esi -; X86-BMI1-NEXT: testb $32, %ch +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB12_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %edi @@ -1350,12 +1340,13 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax -; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx -; X86-BMI2-NEXT: testb $32, %cl +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %ebx, %edx, %edx +; X86-BMI2-NEXT: testb $32, %bl ; X86-BMI2-NEXT: je .LBB12_2 ; X86-BMI2-NEXT: # %bb.1: ; X86-BMI2-NEXT: movl %edx, %eax @@ -1363,18 +1354,17 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-BMI2-NEXT: .LBB12_2: ; X86-BMI2-NEXT: movl $1, %edi ; X86-BMI2-NEXT: xorl %esi, %esi -; X86-BMI2-NEXT: movl %ebx, %ecx ; X86-BMI2-NEXT: shldl %cl, %edi, %esi -; X86-BMI2-NEXT: shlxl %ebx, %edi, %ecx -; X86-BMI2-NEXT: testb $32, %bl +; X86-BMI2-NEXT: shlxl %ecx, %edi, %edi +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB12_4 ; X86-BMI2-NEXT: # %bb.3: -; X86-BMI2-NEXT: movl %ecx, %esi -; X86-BMI2-NEXT: xorl %ecx, %ecx +; X86-BMI2-NEXT: movl %edi, %esi +; X86-BMI2-NEXT: xorl %edi, %edi ; X86-BMI2-NEXT: .LBB12_4: -; X86-BMI2-NEXT: addl $-1, %ecx +; X86-BMI2-NEXT: addl $-1, %edi ; X86-BMI2-NEXT: adcl $-1, %esi -; X86-BMI2-NEXT: andl %ecx, %eax +; X86-BMI2-NEXT: andl %edi, %eax ; X86-BMI2-NEXT: andl %esi, %edx ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: popl %edi @@ -1421,26 +1411,26 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: subl $12, %esp -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %esi, %ebp +; X86-NOBMI-NEXT: movl %esi, %ebx ; X86-NOBMI-NEXT: movl %eax, %ecx -; X86-NOBMI-NEXT: shrl %cl, %ebp -; X86-NOBMI-NEXT: shrdl %cl, %esi, %ebx +; X86-NOBMI-NEXT: shrl %cl, %ebx +; X86-NOBMI-NEXT: shrdl %cl, %esi, %edx +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB13_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %ebp, %ebx -; X86-NOBMI-NEXT: xorl %ebp, %ebp +; X86-NOBMI-NEXT: movl %ebx, %edx +; X86-NOBMI-NEXT: xorl %ebx, %ebx ; X86-NOBMI-NEXT: .LBB13_2: ; X86-NOBMI-NEXT: movl $1, %esi ; X86-NOBMI-NEXT: xorl %edi, %edi -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shldl %cl, %esi, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOBMI-NEXT: shll %cl, %esi -; X86-NOBMI-NEXT: testb $32, %dl +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB13_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %edi @@ -1448,10 +1438,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-NOBMI-NEXT: .LBB13_4: ; X86-NOBMI-NEXT: addl $-1, %esi ; X86-NOBMI-NEXT: adcl $-1, %edi -; X86-NOBMI-NEXT: andl %ebx, %esi -; X86-NOBMI-NEXT: andl 
%ebp, %edi +; X86-NOBMI-NEXT: andl %edx, %esi +; X86-NOBMI-NEXT: andl %ebx, %edi ; X86-NOBMI-NEXT: subl $8, %esp -; X86-NOBMI-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: pushl %ebp ; X86-NOBMI-NEXT: pushl %eax ; X86-NOBMI-NEXT: calll use64@PLT ; X86-NOBMI-NEXT: addl $16, %esp @@ -1471,26 +1461,26 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: subl $12, %esp -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl %esi, %ebp +; X86-BMI1-NEXT: movl %esi, %ebx ; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shrl %cl, %ebp -; X86-BMI1-NEXT: shrdl %cl, %esi, %ebx +; X86-BMI1-NEXT: shrl %cl, %ebx +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB13_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %ebp, %ebx -; X86-BMI1-NEXT: xorl %ebp, %ebp +; X86-BMI1-NEXT: movl %ebx, %edx +; X86-BMI1-NEXT: xorl %ebx, %ebx ; X86-BMI1-NEXT: .LBB13_2: ; X86-BMI1-NEXT: movl $1, %esi ; X86-BMI1-NEXT: xorl %edi, %edi -; X86-BMI1-NEXT: movl %edx, %ecx ; X86-BMI1-NEXT: shldl %cl, %esi, %edi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-BMI1-NEXT: shll %cl, %esi -; X86-BMI1-NEXT: testb $32, %dl +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB13_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %edi @@ -1498,10 +1488,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1-NEXT: .LBB13_4: ; X86-BMI1-NEXT: addl $-1, %esi ; X86-BMI1-NEXT: adcl $-1, %edi -; X86-BMI1-NEXT: andl %ebx, %esi -; X86-BMI1-NEXT: andl %ebp, %edi +; X86-BMI1-NEXT: andl %edx, %esi +; X86-BMI1-NEXT: andl %ebx, %edi ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-BMI1-NEXT: pushl %ebp ; X86-BMI1-NEXT: pushl %eax ; X86-BMI1-NEXT: calll use64@PLT ; X86-BMI1-NEXT: addl $16, %esp @@ -1521,25 +1511,25 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: subl $12, %esp -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl %eax, %ecx -; X86-BMI2-NEXT: shrdl %cl, %esi, %ebx -; X86-BMI2-NEXT: shrxl %eax, %esi, %ebp +; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI2-NEXT: shrxl %eax, %esi, %ebx ; X86-BMI2-NEXT: testb $32, %al ; X86-BMI2-NEXT: je .LBB13_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: movl %ebp, %ebx -; X86-BMI2-NEXT: xorl %ebp, %ebp +; X86-BMI2-NEXT: movl %ebx, %edx +; X86-BMI2-NEXT: xorl %ebx, %ebx ; X86-BMI2-NEXT: .LBB13_2: ; X86-BMI2-NEXT: movl $1, %edi ; X86-BMI2-NEXT: xorl %esi, %esi -; X86-BMI2-NEXT: movl %edx, %ecx ; X86-BMI2-NEXT: shldl %cl, %edi, %esi -; X86-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-BMI2-NEXT: testb $32, %dl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI2-NEXT: shlxl %ecx, %edi, %edi +; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB13_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl %edi, %esi @@ -1547,10 +1537,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 
%numskipbits, i64 %numlowbits ; X86-BMI2-NEXT: .LBB13_4: ; X86-BMI2-NEXT: addl $-1, %edi ; X86-BMI2-NEXT: adcl $-1, %esi -; X86-BMI2-NEXT: andl %ebx, %edi -; X86-BMI2-NEXT: andl %ebp, %esi +; X86-BMI2-NEXT: andl %edx, %edi +; X86-BMI2-NEXT: andl %ebx, %esi ; X86-BMI2-NEXT: subl $8, %esp -; X86-BMI2-NEXT: pushl {{[0-9]+}}(%esp) +; X86-BMI2-NEXT: pushl %ebp ; X86-BMI2-NEXT: pushl %eax ; X86-BMI2-NEXT: calll use64@PLT ; X86-BMI2-NEXT: addl $16, %esp @@ -1618,29 +1608,29 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB14_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB14_2: -; X86-NOBMI-NEXT: movl $1, %edi -; X86-NOBMI-NEXT: movl %edx, %ecx -; X86-NOBMI-NEXT: shll %cl, %edi +; X86-NOBMI-NEXT: movl $1, %esi +; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %dl +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB14_4 ; X86-NOBMI-NEXT: # %bb.3: -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: .LBB14_4: ; X86-NOBMI-NEXT: decl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl @@ -1649,29 +1639,29 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %esi -; X86-BMI1-NEXT: shrl %cl, %esi -; X86-BMI1-NEXT: shrdl %cl, %edi, %eax -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx +; X86-BMI1-NEXT: shrl %cl, %edx +; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: jne .LBB14_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %eax, %esi +; X86-BMI1-NEXT: movl %esi, %edx ; X86-BMI1-NEXT: .LBB14_2: -; X86-BMI1-NEXT: movl $1, %edi -; X86-BMI1-NEXT: movl %edx, %ecx -; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: movl $1, %esi +; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: xorl %eax, %eax -; X86-BMI1-NEXT: testb $32, %dl +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB14_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: movl %esi, %eax ; X86-BMI1-NEXT: .LBB14_4: ; X86-BMI1-NEXT: decl %eax -; X86-BMI1-NEXT: andl %esi, %eax +; X86-BMI1-NEXT: andl %edx, %eax ; X86-BMI1-NEXT: 
popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -1679,11 +1669,11 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-LABEL: bextr64_32_a0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB14_2 ; X86-BMI2-NEXT: # %bb.1: @@ -1742,63 +1732,63 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB15_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB15_2: ; X86-NOBMI-NEXT: movl $1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: decl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl ; ; X86-BMI1-LABEL: bextr64_32_a1: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB15_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB15_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_a1: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB15_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, 
%esi, %eax ; X86-BMI2-NEXT: .LBB15_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -1844,13 +1834,13 @@ define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlow ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: pushl %eax -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: shrl %cl, %esi ; X86-NOBMI-NEXT: shrdl %cl, %edx, %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB16_2 ; X86-NOBMI-NEXT: # %bb.1: @@ -1873,13 +1863,13 @@ define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlow ; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: pushl %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: shrl %cl, %esi ; X86-BMI1-NEXT: shrdl %cl, %edx, %eax +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB16_2 ; X86-BMI1-NEXT: # %bb.1: @@ -1899,11 +1889,11 @@ define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlow ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: pushl %eax -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: shrdl %cl, %eax, %esi +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB16_2 ; X86-BMI2-NEXT: # %bb.1: @@ -1988,63 +1978,63 @@ define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB17_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB17_2: ; X86-NOBMI-NEXT: movl $1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: decl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl ; ; X86-BMI1-LABEL: bextr64_32_a2: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB17_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB17_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_a2: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB17_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB17_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -2090,29 +2080,29 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB18_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB18_2: -; X86-NOBMI-NEXT: movl $1, %edi -; X86-NOBMI-NEXT: movl %edx, %ecx -; X86-NOBMI-NEXT: shll %cl, %edi +; X86-NOBMI-NEXT: movl $1, %esi +; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %dl +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB18_4 ; X86-NOBMI-NEXT: # %bb.3: -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: .LBB18_4: ; X86-NOBMI-NEXT: decl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl @@ -2121,29 +2111,29 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), 
%esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %esi -; X86-BMI1-NEXT: shrl %cl, %esi -; X86-BMI1-NEXT: shrdl %cl, %edi, %eax -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx +; X86-BMI1-NEXT: shrl %cl, %edx +; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: jne .LBB18_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %eax, %esi +; X86-BMI1-NEXT: movl %esi, %edx ; X86-BMI1-NEXT: .LBB18_2: -; X86-BMI1-NEXT: movl $1, %edi -; X86-BMI1-NEXT: movl %edx, %ecx -; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: movl $1, %esi +; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: xorl %eax, %eax -; X86-BMI1-NEXT: testb $32, %dl +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB18_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: movl %esi, %eax ; X86-BMI1-NEXT: .LBB18_4: ; X86-BMI1-NEXT: decl %eax -; X86-BMI1-NEXT: andl %esi, %eax +; X86-BMI1-NEXT: andl %edx, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -2151,11 +2141,11 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-LABEL: bextr64_32_a3: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB18_2 ; X86-BMI2-NEXT: # %bb.1: @@ -2660,15 +2650,16 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: je .LBB25_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -2676,9 +2667,8 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: .LBB25_2: ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %ebx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB25_3 ; X86-NOBMI-NEXT: # %bb.4: ; X86-NOBMI-NEXT: movl %ebx, %eax @@ -2697,48 +2687,46 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; ; X86-BMI1-LABEL: bextr64_b0: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl 
{{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB25_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB25_2: +; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, %edi -; X86-BMI1-NEXT: movl $-1, %ebx -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB25_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB25_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %eax, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi -; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_b0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB25_2 @@ -2798,15 +2786,16 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: je .LBB26_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -2814,9 +2803,8 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; X86-NOBMI-NEXT: .LBB26_2: ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %ebx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB26_3 ; X86-NOBMI-NEXT: # %bb.4: ; X86-NOBMI-NEXT: movl %ebx, %eax @@ -2835,48 +2823,46 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; ; X86-BMI1-LABEL: bextr64_b1_indexzext: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB26_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB26_2: +; X86-BMI1-NEXT: movl $-1, %eax ; 
X86-BMI1-NEXT: movl $-1, %edi -; X86-BMI1-NEXT: movl $-1, %ebx -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB26_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB26_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %eax, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi -; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_b1_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB26_2 @@ -2940,16 +2926,17 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi ; X86-NOBMI-NEXT: movl 4(%eax), %eax ; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: je .LBB27_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -2957,9 +2944,8 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI-NEXT: .LBB27_2: ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %ebx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB27_3 ; X86-NOBMI-NEXT: # %bb.4: ; X86-NOBMI-NEXT: movl %ebx, %eax @@ -2978,51 +2964,49 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; ; X86-BMI1-LABEL: bextr64_b2_load: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI1-NEXT: movl (%edx), %esi -; X86-BMI1-NEXT: movl 4(%edx), %edi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl (%ecx), %esi +; X86-BMI1-NEXT: movl 4(%ecx), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB27_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB27_2: +; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, %edi -; X86-BMI1-NEXT: movl $-1, %ebx -; 
X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB27_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB27_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %eax, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi -; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_b2_load: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl (%edx), %eax -; X86-BMI2-NEXT: movl 4(%edx), %esi -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx -; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movl 4(%edx), %edx +; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB27_2 ; X86-BMI2-NEXT: # %bb.1: @@ -3083,16 +3067,17 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi ; X86-NOBMI-NEXT: movl 4(%eax), %eax ; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %edi ; X86-NOBMI-NEXT: shrdl %cl, %eax, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: je .LBB28_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -3100,9 +3085,8 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI-NEXT: .LBB28_2: ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %ebx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB28_3 ; X86-NOBMI-NEXT: # %bb.4: ; X86-NOBMI-NEXT: movl %ebx, %eax @@ -3121,51 +3105,49 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; ; X86-BMI1-LABEL: bextr64_b3_load_indexzext: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI1-NEXT: movl (%edx), %esi -; X86-BMI1-NEXT: movl 4(%edx), %edi +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl (%ecx), %esi +; X86-BMI1-NEXT: movl 4(%ecx), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB28_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx -; X86-BMI1-NEXT: 
.LBB28_2: -; X86-BMI1-NEXT: movl $-1, %edi -; X86-BMI1-NEXT: movl $-1, %ebx -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: .LBB28_2: +; X86-BMI1-NEXT: movl $-1, %eax +; X86-BMI1-NEXT: movl $-1, %edi +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB28_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB28_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %eax, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi -; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_b3_load_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl (%edx), %eax -; X86-BMI2-NEXT: movl 4(%edx), %esi -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx -; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movl 4(%edx), %edx +; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB28_2 ; X86-BMI2-NEXT: # %bb.1: @@ -3231,14 +3213,15 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl %esi, %edx +; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shrl %cl, %edx ; X86-NOBMI-NEXT: shrdl %cl, %esi, %eax +; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: xorl %esi, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: testb $32, %ch ; X86-NOBMI-NEXT: je .LBB29_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %edx, %eax @@ -3246,9 +3229,8 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X86-NOBMI-NEXT: .LBB29_2: ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: movl $-1, %ebx -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB29_3 ; X86-NOBMI-NEXT: # %bb.4: ; X86-NOBMI-NEXT: movl %ebx, %esi @@ -3267,48 +3249,46 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; ; X86-BMI1-LABEL: bextr64_b4_commutative: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB29_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: .LBB29_2: +; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, %edi -; 
X86-BMI1-NEXT: movl $-1, %ebx -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: shll %cl, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB29_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebx, %edi -; X86-BMI1-NEXT: xorl %ebx, %ebx +; X86-BMI1-NEXT: movl %edi, %eax +; X86-BMI1-NEXT: xorl %edi, %edi ; X86-BMI1-NEXT: .LBB29_4: -; X86-BMI1-NEXT: andnl %edx, %edi, %edx -; X86-BMI1-NEXT: andnl %esi, %ebx, %eax +; X86-BMI1-NEXT: andnl %edx, %eax, %edx +; X86-BMI1-NEXT: andnl %esi, %edi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi -; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_b4_commutative: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB29_2 @@ -3370,43 +3350,42 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: subl $12, %esp -; X86-NOBMI-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %esi, %ebp -; X86-NOBMI-NEXT: movb %al, %cl -; X86-NOBMI-NEXT: shrl %cl, %ebp +; X86-NOBMI-NEXT: movl %esi, %ebx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %ebx ; X86-NOBMI-NEXT: shrdl %cl, %esi, %edx -; X86-NOBMI-NEXT: xorl %ebx, %ebx +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: je .LBB30_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %ebp, %edx -; X86-NOBMI-NEXT: xorl %ebp, %ebp +; X86-NOBMI-NEXT: movl %ebx, %edx +; X86-NOBMI-NEXT: xorl %ebx, %ebx ; X86-NOBMI-NEXT: .LBB30_2: ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: movl $-1, %esi -; X86-NOBMI-NEXT: movb %ch, %cl ; X86-NOBMI-NEXT: shll %cl, %esi -; X86-NOBMI-NEXT: testb $32, %ch +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB30_3 ; X86-NOBMI-NEXT: # %bb.4: -; X86-NOBMI-NEXT: movl %esi, %ebx +; X86-NOBMI-NEXT: movl %esi, %ebp ; X86-NOBMI-NEXT: jmp .LBB30_5 ; X86-NOBMI-NEXT: .LBB30_3: ; X86-NOBMI-NEXT: movl %esi, %edi ; X86-NOBMI-NEXT: .LBB30_5: ; X86-NOBMI-NEXT: notl %edi -; X86-NOBMI-NEXT: andl %ebp, %edi -; X86-NOBMI-NEXT: notl %ebx -; X86-NOBMI-NEXT: andl %edx, %ebx +; X86-NOBMI-NEXT: andl %ebx, %edi +; X86-NOBMI-NEXT: notl %ebp +; X86-NOBMI-NEXT: andl %edx, %ebp ; X86-NOBMI-NEXT: subl $8, %esp ; X86-NOBMI-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NOBMI-NEXT: pushl %eax ; X86-NOBMI-NEXT: calll use64@PLT ; X86-NOBMI-NEXT: addl $16, %esp -; X86-NOBMI-NEXT: movl %ebx, %eax +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl %edi, %edx ; X86-NOBMI-NEXT: addl $12, %esp ; X86-NOBMI-NEXT: popl %esi @@ -3422,34 +3401,34 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: subl $12, %esp -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl 
{{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movl %ebx, %esi +; X86-BMI1-NEXT: movl %edi, %esi ; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %esi -; X86-BMI1-NEXT: shrdl %cl, %ebx, %edi +; X86-BMI1-NEXT: shrdl %cl, %edi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: je .LBB30_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edi +; X86-BMI1-NEXT: movl %esi, %edx ; X86-BMI1-NEXT: xorl %esi, %esi ; X86-BMI1-NEXT: .LBB30_2: -; X86-BMI1-NEXT: movl $-1, %ebx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebp -; X86-BMI1-NEXT: movl %edx, %ecx ; X86-BMI1-NEXT: shll %cl, %ebp -; X86-BMI1-NEXT: testb $32, %dl +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB30_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %ebp, %ebx +; X86-BMI1-NEXT: movl %ebp, %edi ; X86-BMI1-NEXT: xorl %ebp, %ebp ; X86-BMI1-NEXT: .LBB30_4: -; X86-BMI1-NEXT: andnl %esi, %ebx, %esi -; X86-BMI1-NEXT: andnl %edi, %ebp, %edi +; X86-BMI1-NEXT: andnl %esi, %edi, %esi +; X86-BMI1-NEXT: andnl %edx, %ebp, %edi ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %eax ; X86-BMI1-NEXT: calll use64@PLT ; X86-BMI1-NEXT: addl $16, %esp @@ -3469,11 +3448,11 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: subl $12, %esp -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB30_2 @@ -3562,28 +3541,28 @@ define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: jne .LBB31_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: .LBB31_2: ; X86-NOBMI-NEXT: movl $-1, %esi -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %esi -; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: testb $32, %dl +; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB31_4 ; X86-NOBMI-NEXT: # %bb.3: -; X86-NOBMI-NEXT: movl %esi, %ecx +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB31_4: -; X86-NOBMI-NEXT: notl %ecx -; X86-NOBMI-NEXT: andl %ecx, %eax +; X86-NOBMI-NEXT: notl %edx +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl @@ -3593,27 +3572,27 @@ define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), 
%esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: jne .LBB31_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %esi, %edx ; X86-BMI1-NEXT: .LBB31_2: -; X86-BMI1-NEXT: movl $-1, %esi -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %esi -; X86-BMI1-NEXT: xorl %ecx, %ecx -; X86-BMI1-NEXT: testb $32, %al +; X86-BMI1-NEXT: movl $-1, %eax +; X86-BMI1-NEXT: shll %cl, %eax +; X86-BMI1-NEXT: xorl %esi, %esi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB31_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %esi, %ecx +; X86-BMI1-NEXT: movl %eax, %esi ; X86-BMI1-NEXT: .LBB31_4: -; X86-BMI1-NEXT: andnl %edx, %ecx, %eax +; X86-BMI1-NEXT: andnl %edx, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -3621,24 +3600,24 @@ define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI2-LABEL: bextr64_32_b0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB31_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB31_2: ; X86-BMI2-NEXT: xorl %ecx, %ecx -; X86-BMI2-NEXT: testb $32, %al +; X86-BMI2-NEXT: testb $32, %dl ; X86-BMI2-NEXT: jne .LBB31_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl $-1, %ecx -; X86-BMI2-NEXT: shlxl %eax, %ecx, %ecx +; X86-BMI2-NEXT: shlxl %edx, %ecx, %ecx ; X86-BMI2-NEXT: .LBB31_4: -; X86-BMI2-NEXT: andnl %edx, %ecx, %eax +; X86-BMI2-NEXT: andnl %eax, %ecx, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -3684,63 +3663,63 @@ define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB32_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB32_2: ; X86-NOBMI-NEXT: movl $-1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: notl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl ; ; X86-BMI1-LABEL: bextr64_32_b1: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: 
pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB32_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB32_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_b1: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB32_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB32_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -3786,63 +3765,63 @@ define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %edi, %esi -; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: shrdl %cl, %edi, %eax -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movl %edi, %edx +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: shrl %cl, %edx +; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %al ; X86-NOBMI-NEXT: jne .LBB33_2 ; X86-NOBMI-NEXT: # %bb.1: -; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB33_2: ; X86-NOBMI-NEXT: movl $-1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: notl %eax -; X86-NOBMI-NEXT: andl %esi, %eax +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl ; ; X86-BMI1-LABEL: bextr64_32_b2: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl 
%esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB33_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB33_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_b2: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB33_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB33_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -3890,28 +3869,28 @@ define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: shrdl %cl, %edi, %esi -; X86-NOBMI-NEXT: testb $32, %cl +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: testb $32, %dl ; X86-NOBMI-NEXT: jne .LBB34_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: .LBB34_2: ; X86-NOBMI-NEXT: movl $-1, %esi -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %esi -; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: testb $32, %dl +; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB34_4 ; X86-NOBMI-NEXT: # %bb.3: -; X86-NOBMI-NEXT: movl %esi, %ecx +; X86-NOBMI-NEXT: movl %esi, %edx ; X86-NOBMI-NEXT: .LBB34_4: -; X86-NOBMI-NEXT: notl %ecx -; X86-NOBMI-NEXT: andl %ecx, %eax +; X86-NOBMI-NEXT: notl %edx +; X86-NOBMI-NEXT: andl %edx, %eax ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: retl @@ -3921,27 +3900,27 @@ define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI1-NEXT: movl %edi, %edx +; X86-BMI1-NEXT: movl %eax, %ecx ; X86-BMI1-NEXT: shrl %cl, %edx ; X86-BMI1-NEXT: shrdl %cl, %edi, %esi -; X86-BMI1-NEXT: testb $32, %cl +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: testb $32, %al ; X86-BMI1-NEXT: jne .LBB34_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %esi, %edx ; X86-BMI1-NEXT: .LBB34_2: -; X86-BMI1-NEXT: movl $-1, %esi -; X86-BMI1-NEXT: movl %eax, %ecx -; X86-BMI1-NEXT: shll %cl, %esi -; X86-BMI1-NEXT: xorl %ecx, %ecx -; X86-BMI1-NEXT: testb $32, %al 
+; X86-BMI1-NEXT: movl $-1, %eax +; X86-BMI1-NEXT: shll %cl, %eax +; X86-BMI1-NEXT: xorl %esi, %esi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB34_4 ; X86-BMI1-NEXT: # %bb.3: -; X86-BMI1-NEXT: movl %esi, %ecx +; X86-BMI1-NEXT: movl %eax, %esi ; X86-BMI1-NEXT: .LBB34_4: -; X86-BMI1-NEXT: andnl %edx, %ecx, %eax +; X86-BMI1-NEXT: andnl %edx, %esi, %eax ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: popl %edi ; X86-BMI1-NEXT: retl @@ -3949,24 +3928,24 @@ define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X86-BMI2-LABEL: bextr64_32_b3: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB34_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB34_2: ; X86-BMI2-NEXT: xorl %ecx, %ecx -; X86-BMI2-NEXT: testb $32, %al +; X86-BMI2-NEXT: testb $32, %dl ; X86-BMI2-NEXT: jne .LBB34_4 ; X86-BMI2-NEXT: # %bb.3: ; X86-BMI2-NEXT: movl $-1, %ecx -; X86-BMI2-NEXT: shlxl %eax, %ecx, %ecx +; X86-BMI2-NEXT: shlxl %edx, %ecx, %ecx ; X86-BMI2-NEXT: .LBB34_4: -; X86-BMI2-NEXT: andnl %edx, %ecx, %eax +; X86-BMI2-NEXT: andnl %eax, %ecx, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -3975,10 +3954,10 @@ define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movq %rsi, %rcx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rdi -; X64-NOBMI-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: movl $4294967295, %esi # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: movl %edx, %ecx ; X64-NOBMI-NEXT: shlq %cl, %rsi +; X64-NOBMI-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: xorl %esi, %eax ; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax @@ -4083,9 +4062,10 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %ebp ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %ebp +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp ; X64-NOBMI-NEXT: movl %ebp, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4105,9 +4085,10 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-BMI1-NEXT: movl %edi, %ebx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebx -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebp ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %ebp +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebp ; X64-BMI1-NEXT: movl %ebp, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4213,9 +4194,10 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negb %dl -; 
X64-NOBMI-NEXT: movl $-1, %ebp ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %ebp +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp ; X64-NOBMI-NEXT: movl %ebp, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4235,9 +4217,10 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-BMI1-NEXT: movl %edi, %ebx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebx -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebp ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %ebp +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebp ; X64-BMI1-NEXT: movl %ebp, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4348,9 +4331,10 @@ define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movl (%rdi), %ebp ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %ebx ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %ebx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx ; X64-NOBMI-NEXT: movl %ebx, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4370,9 +4354,10 @@ define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-BMI1-NEXT: movl (%rdi), %ebp ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebp -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebx ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %ebx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebx ; X64-BMI1-NEXT: movl %ebx, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4482,9 +4467,10 @@ define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-NOBMI-NEXT: movl (%rdi), %ebp ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %ebx ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %ebx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx ; X64-NOBMI-NEXT: movl %ebx, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4504,9 +4490,10 @@ define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-BMI1-NEXT: movl (%rdi), %ebp ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebp -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebx ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %ebx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebx ; X64-BMI1-NEXT: movl %ebx, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4615,9 +4602,10 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X64-NOBMI-NEXT: movl %edi, %ebx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebx -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %ebp ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %ebp +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp ; X64-NOBMI-NEXT: movl %ebp, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4637,9 +4625,10 @@ define 
i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X64-BMI1-NEXT: movl %edi, %ebx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebx -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %ebp ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %ebp +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %ebp ; X64-BMI1-NEXT: movl %ebp, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4760,9 +4749,10 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-NOBMI-NEXT: movl %edi, %ebp ; X64-NOBMI-NEXT: movl %ebx, %ecx ; X64-NOBMI-NEXT: shrl %cl, %ebp -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl $-1, %r14d ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movl $-1, %r14d +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %r14d ; X64-NOBMI-NEXT: movl %r14d, %edi ; X64-NOBMI-NEXT: callq use32@PLT @@ -4784,9 +4774,10 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-BMI1-NEXT: movl %edi, %ebp ; X64-BMI1-NEXT: movl %ebx, %ecx ; X64-BMI1-NEXT: shrl %cl, %ebp -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movl $-1, %r14d ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movl $-1, %r14d +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrl %cl, %r14d ; X64-BMI1-NEXT: movl %r14d, %edi ; X64-BMI1-NEXT: callq use32@PLT @@ -4975,9 +4966,10 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movq $-1, %r14 -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 ; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -4997,9 +4989,10 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: negb %dl +; X64-BMI1-NEXT: movq %rdx, %rcx +; X64-BMI1-NEXT: negb %cl ; X64-BMI1-NEXT: movq $-1, %r14 -; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %r14 ; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -5179,9 +5172,10 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %r14 ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movq $-1, %r14 +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %r14 ; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -5201,9 +5195,10 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %r14 ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movq $-1, %r14 +; 
X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrq %cl, %r14 ; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -5346,8 +5341,8 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl (%eax), %esi ; X86-BMI2-NEXT: movl 4(%eax), %eax -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi ; X86-BMI2-NEXT: shrdl %cl, %eax, %esi +; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB43_2 ; X86-BMI2-NEXT: # %bb.1: @@ -5389,9 +5384,10 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq (%rdi), %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movq $-1, %rbx -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rbx ; X64-NOBMI-NEXT: movq %rbx, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -5411,9 +5407,10 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-BMI1-NEXT: movq (%rdi), %r14 ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %r14 -; X64-BMI1-NEXT: negb %dl +; X64-BMI1-NEXT: movq %rdx, %rcx +; X64-BMI1-NEXT: negb %cl ; X64-BMI1-NEXT: movq $-1, %rbx -; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %rbx ; X64-BMI1-NEXT: movq %rbx, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -5554,8 +5551,8 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl (%eax), %esi ; X86-BMI2-NEXT: movl 4(%eax), %eax -; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi ; X86-BMI2-NEXT: shrdl %cl, %eax, %esi +; X86-BMI2-NEXT: shrxl %ecx, %eax, %edi ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB44_2 ; X86-BMI2-NEXT: # %bb.1: @@ -5597,9 +5594,10 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-NOBMI-NEXT: movq (%rdi), %r14 ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rbx ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl +; X64-NOBMI-NEXT: movq $-1, %rbx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rbx ; X64-NOBMI-NEXT: movq %rbx, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -5619,9 +5617,10 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-BMI1-NEXT: movq (%rdi), %r14 ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrq %cl, %r14 -; X64-BMI1-NEXT: negb %dl -; X64-BMI1-NEXT: movq $-1, %rbx ; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: negb %cl +; X64-BMI1-NEXT: movq $-1, %rbx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1-NEXT: shrq %cl, %rbx ; X64-BMI1-NEXT: movq %rbx, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -5805,9 +5804,10 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-NOBMI-NEXT: movq %rdi, %rbx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rbx -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movq $-1, %r14 -; X64-NOBMI-NEXT: movl %edx, %ecx 
+; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r14 ; X64-NOBMI-NEXT: movq %r14, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -5827,9 +5827,10 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-BMI1-NEXT: movq %rdi, %rbx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %rbx -; X64-BMI1-NEXT: negb %dl +; X64-BMI1-NEXT: movq %rdx, %rcx +; X64-BMI1-NEXT: negb %cl ; X64-BMI1-NEXT: movq $-1, %r14 -; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %r14 ; X64-BMI1-NEXT: movq %r14, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -6024,9 +6025,10 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-NOBMI-NEXT: movq %rdi, %r14 ; X64-NOBMI-NEXT: movl %ebx, %ecx ; X64-NOBMI-NEXT: shrq %cl, %r14 -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movq $-1, %r15 -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %r15 ; X64-NOBMI-NEXT: movq %r15, %rdi ; X64-NOBMI-NEXT: callq use64@PLT @@ -6048,9 +6050,10 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-BMI1-NEXT: movq %rdi, %r14 ; X64-BMI1-NEXT: movl %ebx, %ecx ; X64-BMI1-NEXT: shrq %cl, %r14 -; X64-BMI1-NEXT: negb %dl +; X64-BMI1-NEXT: movq %rdx, %rcx +; X64-BMI1-NEXT: negb %cl ; X64-BMI1-NEXT: movq $-1, %r15 -; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %r15 ; X64-BMI1-NEXT: movq %r15, %rdi ; X64-BMI1-NEXT: callq use64@PLT @@ -6176,9 +6179,10 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rsi, %rcx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rdi -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movq $-1, %rax -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax @@ -6232,40 +6236,40 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; ; X86-BMI1-LABEL: bextr64_32_c1: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB48_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB48_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_c1: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl 
%esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB48_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB48_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -6275,9 +6279,10 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq @@ -6331,40 +6336,40 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; ; X86-BMI1-LABEL: bextr64_32_c2: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB49_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB49_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_c2: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB49_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB49_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -6374,9 +6379,10 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl 
killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq @@ -6494,9 +6500,10 @@ define i32 @bextr64_32_c3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rsi, %rcx ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rdi -; X64-NOBMI-NEXT: negb %dl +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax @@ -6507,9 +6514,10 @@ define i32 @bextr64_32_c3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-BMI1-NEXT: movq %rsi, %rcx ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %rdi -; X64-BMI1-NEXT: negb %dl +; X64-BMI1-NEXT: movq %rdx, %rcx +; X64-BMI1-NEXT: negb %cl ; X64-BMI1-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-BMI1-NEXT: movl %edx, %ecx +; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1-NEXT: shrq %cl, %rax ; X64-BMI1-NEXT: andl %edi, %eax ; X64-BMI1-NEXT: # kill: def $eax killed $eax killed $rax @@ -6572,9 +6580,10 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: retq ; @@ -6634,9 +6643,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-NOBMI-NEXT: movl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: retq ; @@ -6701,9 +6711,10 @@ define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movl (%rdi), %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: retq ; @@ -6767,9 +6778,10 @@ define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-NOBMI-NEXT: movl (%rdi), %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: retq ; @@ -6921,16 +6933,16 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shldl %cl, %edi, %eax ; X86-NOBMI-NEXT: shll %cl, %edi -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %edi, %ebx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB56_4 ; 
X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: .LBB56_4: ; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB56_6 ; X86-NOBMI-NEXT: # %bb.5: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -6969,16 +6981,16 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: shldl %cl, %edi, %eax ; X86-BMI1-NEXT: shll %cl, %edi -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB56_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %ebx ; X86-BMI1-NEXT: .LBB56_4: ; X86-BMI1-NEXT: movl %ebx, %eax ; X86-BMI1-NEXT: shrl %cl, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB56_6 ; X86-BMI1-NEXT: # %bb.5: ; X86-BMI1-NEXT: movl %edi, %esi @@ -7042,9 +7054,10 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shlq %cl, %rax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: retq ; @@ -7091,16 +7104,16 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shldl %cl, %edi, %eax ; X86-NOBMI-NEXT: shll %cl, %edi -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %edi, %ebx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB57_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: .LBB57_4: ; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB57_6 ; X86-NOBMI-NEXT: # %bb.5: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -7139,16 +7152,16 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: shldl %cl, %edi, %eax ; X86-BMI1-NEXT: shll %cl, %edi -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB57_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %ebx ; X86-BMI1-NEXT: .LBB57_4: ; X86-BMI1-NEXT: movl %ebx, %eax ; X86-BMI1-NEXT: shrl %cl, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB57_6 ; X86-BMI1-NEXT: # %bb.5: ; X86-BMI1-NEXT: movl %edi, %esi @@ -7212,9 +7225,10 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shlq %cl, %rax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: retq ; @@ -7266,16 +7280,16 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: 
shldl %cl, %edi, %eax ; X86-NOBMI-NEXT: shll %cl, %edi -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %edi, %ebx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB58_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: .LBB58_4: ; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB58_6 ; X86-NOBMI-NEXT: # %bb.5: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -7315,16 +7329,16 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: shldl %cl, %edi, %eax ; X86-BMI1-NEXT: shll %cl, %edi -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB58_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %ebx ; X86-BMI1-NEXT: .LBB58_4: ; X86-BMI1-NEXT: movl %ebx, %eax ; X86-BMI1-NEXT: shrl %cl, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB58_6 ; X86-BMI1-NEXT: # %bb.5: ; X86-BMI1-NEXT: movl %edi, %esi @@ -7349,8 +7363,8 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl (%edx), %eax ; X86-BMI2-NEXT: movl 4(%edx), %edx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %esi ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: shrxl %ecx, %edx, %esi ; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB58_2 @@ -7389,9 +7403,10 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq (%rdi), %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shlq %cl, %rax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: retq ; @@ -7440,16 +7455,16 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shldl %cl, %edi, %eax ; X86-NOBMI-NEXT: shll %cl, %edi -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %edi, %ebx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB59_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: .LBB59_4: ; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB59_6 ; X86-NOBMI-NEXT: # %bb.5: ; X86-NOBMI-NEXT: movl %edi, %esi @@ -7489,16 +7504,16 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: shldl %cl, %edi, %eax ; X86-BMI1-NEXT: shll %cl, %edi -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %edi, %ebx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB59_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %eax, %ebx ; X86-BMI1-NEXT: .LBB59_4: ; X86-BMI1-NEXT: movl %ebx, %eax ; X86-BMI1-NEXT: shrl %cl, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB59_6 ; X86-BMI1-NEXT: # %bb.5: ; X86-BMI1-NEXT: movl %edi, %esi @@ 
-7523,8 +7538,8 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movl (%edx), %eax ; X86-BMI2-NEXT: movl 4(%edx), %edx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %esi ; X86-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-BMI2-NEXT: shrxl %ecx, %edx, %esi ; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB59_2 @@ -7563,9 +7578,10 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; X64-NOBMI-NEXT: movq (%rdi), %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shlq %cl, %rax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: retq ; @@ -7620,30 +7636,30 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NOBMI-NEXT: shldl %cl, %ebx, %esi ; X86-NOBMI-NEXT: shll %cl, %ebx -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %ebx, %ebp +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB60_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %ebp ; X86-NOBMI-NEXT: .LBB60_4: ; X86-NOBMI-NEXT: movl %ebp, %esi ; X86-NOBMI-NEXT: shrl %cl, %esi -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edi +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB60_6 ; X86-NOBMI-NEXT: # %bb.5: ; X86-NOBMI-NEXT: movl %ebx, %edx ; X86-NOBMI-NEXT: movl %esi, %edi ; X86-NOBMI-NEXT: .LBB60_6: ; X86-NOBMI-NEXT: shrdl %cl, %ebp, %edx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NOBMI-NEXT: testb $32, %cl -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: jne .LBB60_8 ; X86-NOBMI-NEXT: # %bb.7: ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: .LBB60_8: ; X86-NOBMI-NEXT: subl $8, %esp -; X86-NOBMI-NEXT: pushl %ecx +; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %eax ; X86-NOBMI-NEXT: calll use64@PLT ; X86-NOBMI-NEXT: addl $16, %esp @@ -7681,30 +7697,30 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: shldl %cl, %ebx, %esi ; X86-BMI1-NEXT: shll %cl, %ebx -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %ebx, %ebp +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB60_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %ebp ; X86-BMI1-NEXT: .LBB60_4: ; X86-BMI1-NEXT: movl %ebp, %esi ; X86-BMI1-NEXT: shrl %cl, %esi -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB60_6 ; X86-BMI1-NEXT: # %bb.5: ; X86-BMI1-NEXT: movl %ebx, %edx ; X86-BMI1-NEXT: movl %esi, %edi ; X86-BMI1-NEXT: .LBB60_6: ; X86-BMI1-NEXT: shrdl %cl, %ebp, %edx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: jne .LBB60_8 ; X86-BMI1-NEXT: # %bb.7: ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: .LBB60_8: ; X86-BMI1-NEXT: subl $8, %esp -; X86-BMI1-NEXT: pushl %ecx +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %eax ; X86-BMI1-NEXT: calll use64@PLT ; X86-BMI1-NEXT: addl $16, %esp @@ -7751,14 +7767,14 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X86-BMI2-NEXT: movl %edi, %esi ; X86-BMI2-NEXT: .LBB60_6: ; X86-BMI2-NEXT: shrdl %cl, %edx, %ebx +; 
X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: jne .LBB60_8 ; X86-BMI2-NEXT: # %bb.7: ; X86-BMI2-NEXT: movl %ebx, %edi ; X86-BMI2-NEXT: .LBB60_8: ; X86-BMI2-NEXT: subl $8, %esp -; X86-BMI2-NEXT: pushl %ecx +; X86-BMI2-NEXT: pushl %edx ; X86-BMI2-NEXT: pushl %eax ; X86-BMI2-NEXT: calll use64@PLT ; X86-BMI2-NEXT: addl $16, %esp @@ -7927,9 +7943,10 @@ define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: movq %rdx, %rcx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shlq %cl, %rax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq @@ -7983,40 +8000,40 @@ define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; ; X86-BMI1-LABEL: bextr64_32_d1: ; X86-BMI1: # %bb.0: -; X86-BMI1-NEXT: pushl %edi +; X86-BMI1-NEXT: pushl %ebx ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI1-NEXT: movl %edi, %edx -; X86-BMI1-NEXT: shrl %cl, %edx -; X86-BMI1-NEXT: shrdl %cl, %edi, %esi +; X86-BMI1-NEXT: movl %esi, %eax +; X86-BMI1-NEXT: shrl %cl, %eax +; X86-BMI1-NEXT: shrdl %cl, %esi, %edx +; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB62_2 ; X86-BMI1-NEXT: # %bb.1: -; X86-BMI1-NEXT: movl %esi, %edx +; X86-BMI1-NEXT: movl %edx, %eax ; X86-BMI1-NEXT: .LBB62_2: -; X86-BMI1-NEXT: shll $8, %eax -; X86-BMI1-NEXT: bextrl %eax, %edx, %eax +; X86-BMI1-NEXT: shll $8, %ebx +; X86-BMI1-NEXT: bextrl %ebx, %eax, %eax ; X86-BMI1-NEXT: popl %esi -; X86-BMI1-NEXT: popl %edi +; X86-BMI1-NEXT: popl %ebx ; X86-BMI1-NEXT: retl ; ; X86-BMI2-LABEL: bextr64_32_d1: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-BMI2-NEXT: shrdl %cl, %esi, %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB62_2 ; X86-BMI2-NEXT: # %bb.1: -; X86-BMI2-NEXT: shrxl %ecx, %esi, %edx +; X86-BMI2-NEXT: shrxl %ecx, %esi, %eax ; X86-BMI2-NEXT: .LBB62_2: -; X86-BMI2-NEXT: bzhil %eax, %edx, %eax +; X86-BMI2-NEXT: bzhil %edx, %eax, %eax ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl ; @@ -8026,9 +8043,10 @@ define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind ; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: negb %dl ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: negb %cl ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shrl %cl, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq @@ -8074,18 +8092,18 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind { ; X86-BMINOTBM-LABEL: pr38938: 
; X86-BMINOTBM: # %bb.0: ; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMINOTBM-NEXT: movl $2581, %ecx # imm = 0xA15 +; X86-BMINOTBM-NEXT: bextrl %ecx, (%eax), %eax ; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMINOTBM-NEXT: movl $2581, %edx # imm = 0xA15 -; X86-BMINOTBM-NEXT: bextrl %edx, (%ecx), %ecx -; X86-BMINOTBM-NEXT: incl (%eax,%ecx,4) +; X86-BMINOTBM-NEXT: incl (%ecx,%eax,4) ; X86-BMINOTBM-NEXT: retl ; ; X86-BMITBM-LABEL: pr38938: ; X86-BMITBM: # %bb.0: ; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMITBM-NEXT: bextrl $2581, (%eax), %eax # imm = 0xA15 ; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMITBM-NEXT: bextrl $2581, (%ecx), %ecx # imm = 0xA15 -; X86-BMITBM-NEXT: incl (%eax,%ecx,4) +; X86-BMITBM-NEXT: incl (%ecx,%eax,4) ; X86-BMITBM-NEXT: retl ; ; X64-NOBMI-LABEL: pr38938: @@ -8480,19 +8498,19 @@ define void @c5_i64(i64 %arg, ptr %ptr) nounwind { ; ; X86-BMINOTBM-LABEL: c5_i64: ; X86-BMINOTBM: # %bb.0: -; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMINOTBM-NEXT: movl $2579, %ecx # imm = 0xA13 -; X86-BMINOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx -; X86-BMINOTBM-NEXT: movl %ecx, (%eax) -; X86-BMINOTBM-NEXT: movl $0, 4(%eax) +; X86-BMINOTBM-NEXT: movl $2579, %eax # imm = 0xA13 +; X86-BMINOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax +; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMINOTBM-NEXT: movl %eax, (%ecx) +; X86-BMINOTBM-NEXT: movl $0, 4(%ecx) ; X86-BMINOTBM-NEXT: retl ; ; X86-BMITBM-LABEL: c5_i64: ; X86-BMITBM: # %bb.0: -; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13 -; X86-BMITBM-NEXT: movl %ecx, (%eax) -; X86-BMITBM-NEXT: movl $0, 4(%eax) +; X86-BMITBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %eax # imm = 0xA13 +; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMITBM-NEXT: movl %eax, (%ecx) +; X86-BMITBM-NEXT: movl $0, 4(%ecx) ; X86-BMITBM-NEXT: retl ; ; X64-NOBMI-LABEL: c5_i64: @@ -8534,19 +8552,19 @@ define void @c6_i64(i64 %arg, ptr %ptr) nounwind { ; ; X86-BMINOTBM-LABEL: c6_i64: ; X86-BMINOTBM: # %bb.0: -; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMINOTBM-NEXT: movl $3091, %ecx # imm = 0xC13 -; X86-BMINOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx -; X86-BMINOTBM-NEXT: movl %ecx, (%eax) -; X86-BMINOTBM-NEXT: movl $0, 4(%eax) +; X86-BMINOTBM-NEXT: movl $3091, %eax # imm = 0xC13 +; X86-BMINOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax +; X86-BMINOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMINOTBM-NEXT: movl %eax, (%ecx) +; X86-BMINOTBM-NEXT: movl $0, 4(%ecx) ; X86-BMINOTBM-NEXT: retl ; ; X86-BMITBM-LABEL: c6_i64: ; X86-BMITBM: # %bb.0: -; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMITBM-NEXT: bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13 -; X86-BMITBM-NEXT: movl %ecx, (%eax) -; X86-BMITBM-NEXT: movl $0, 4(%eax) +; X86-BMITBM-NEXT: bextrl $3091, {{[0-9]+}}(%esp), %eax # imm = 0xC13 +; X86-BMITBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMITBM-NEXT: movl %eax, (%ecx) +; X86-BMITBM-NEXT: movl $0, 4(%ecx) ; X86-BMITBM-NEXT: retl ; ; X64-NOBMI-LABEL: c6_i64: diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll index f12693469a3f6..cf462b1e229dc 100644 --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -84,9 +84,9 @@ define <16 x i64> @catcat(<4 x i64> %x) { ; AVX512F-LABEL: catcat: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; 
AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm1 ; AVX512F-NEXT: vmovaps %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -135,9 +135,9 @@ define <16 x i64> @load_catcat(ptr %p) { ; AVX512F-LABEL: load_catcat: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5] ; AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7] ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: retq %x = load <4 x i64>, ptr %p diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll index 077e7770c83cd..b6b68cd81ad9c 100644 --- a/llvm/test/CodeGen/X86/extract-lowbits.ll +++ b/llvm/test/CodeGen/X86/extract-lowbits.ll @@ -547,11 +547,11 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-NOBMI-LABEL: bzhi64_a2_load: ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB8_2 @@ -569,11 +569,11 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-BMI1-LABEL: bzhi64_a2_load: ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: shldl %cl, %eax, %edx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: shll %cl, %eax ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB8_2 @@ -591,11 +591,11 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-BMI2-LABEL: bzhi64_a2_load: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB8_2 @@ -641,11 +641,11 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { ; X86-NOBMI-LABEL: bzhi64_a3_load_indexzext: ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx ; X86-NOBMI-NEXT: shldl %cl, %eax, %edx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: je .LBB9_2 @@ -663,11 +663,11 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { ; X86-BMI1-LABEL: bzhi64_a3_load_indexzext: ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl $1, %eax ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: 
shldl %cl, %eax, %edx +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: shll %cl, %eax ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: je .LBB9_2 @@ -685,11 +685,11 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { ; X86-BMI2-LABEL: bzhi64_a3_load_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $1, %eax ; X86-BMI2-NEXT: xorl %edx, %edx ; X86-BMI2-NEXT: shldl %cl, %eax, %edx +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: je .LBB9_2 @@ -950,14 +950,16 @@ define i32 @bzhi64_32_a1(i64 %val, i32 %numlowbits) nounwind { define i32 @bzhi64_32_a1_trunc_extrause(i64 %val, i32 %numlowbits, ptr %escape) nounwind { ; X86-NOBMI-LABEL: bzhi64_32_a1_trunc_extrause: ; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl %edx, (%eax) +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl $1, %eax ; X86-NOBMI-NEXT: shll %cl, %eax +; X86-NOBMI-NEXT: movl %edx, (%esi) ; X86-NOBMI-NEXT: decl %eax ; X86-NOBMI-NEXT: andl %edx, %eax +; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: retl ; ; X86-BMI1-LABEL: bzhi64_32_a1_trunc_extrause: @@ -982,10 +984,10 @@ define i32 @bzhi64_32_a1_trunc_extrause(i64 %val, i32 %numlowbits, ptr %escape) ; X64-NOBMI-LABEL: bzhi64_32_a1_trunc_extrause: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movl %esi, %ecx -; X64-NOBMI-NEXT: movl %edi, (%rdx) ; X64-NOBMI-NEXT: movl $1, %eax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shll %cl, %eax +; X64-NOBMI-NEXT: movl %edi, (%rdx) ; X64-NOBMI-NEXT: decl %eax ; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: retq @@ -1404,9 +1406,9 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: shll %cl, %esi +; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB21_1 @@ -1485,9 +1487,9 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: shll %cl, %esi +; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB22_1 @@ -1569,11 +1571,11 @@ define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: shll %cl, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB23_1 @@ -1660,11 +1662,11 @@ define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl 
{{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: shll %cl, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB24_1 @@ -1754,9 +1756,9 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind { ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: movl $-1, %esi ; X86-NOBMI-NEXT: shll %cl, %esi +; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB25_1 @@ -2059,10 +2061,10 @@ define i32 @bzhi64_32_b3(i64 %val, i8 %numlowbits) nounwind { ; X64-NOBMI-LABEL: bzhi64_32_b3: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movl %esi, %ecx -; X64-NOBMI-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NOBMI-NEXT: shlq %cl, %rdx +; X64-NOBMI-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NOBMI-NEXT: xorl %edx, %eax ; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax @@ -2481,9 +2483,9 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits, ptr %escape) nounwind { ; X86-NOBMI-LABEL: bzhi64_c0: ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movb $64, %cl ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl $-1, %eax ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: shrl %cl, %edx @@ -2503,9 +2505,9 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits, ptr %escape) nounwind { ; X86-BMI1-LABEL: bzhi64_c0: ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movb $64, %cl ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, %edx ; X86-BMI1-NEXT: shrl %cl, %edx @@ -2525,9 +2527,9 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits, ptr %escape) nounwind { ; X86-BMI2-LABEL: bzhi64_c0: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movb $64, %bl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %bl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $-1, %eax ; X86-BMI2-NEXT: shrxl %ebx, %eax, %edx ; X86-BMI2-NEXT: testb $32, %bl @@ -2583,9 +2585,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits, ptr %escape) nounwind ; X86-NOBMI-LABEL: bzhi64_c1_indexzext: ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movb $64, %cl ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl $-1, %eax ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: shrl %cl, %edx @@ -2605,9 +2607,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits, ptr %escape) nounwind ; X86-BMI1-LABEL: bzhi64_c1_indexzext: ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movb $64, %cl ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, 
%edx ; X86-BMI1-NEXT: shrl %cl, %edx @@ -2627,9 +2629,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits, ptr %escape) nounwind ; X86-BMI2-LABEL: bzhi64_c1_indexzext: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movb $64, %bl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %bl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $-1, %eax ; X86-BMI2-NEXT: shrxl %ebx, %eax, %edx ; X86-BMI2-NEXT: testb $32, %bl @@ -2690,9 +2692,9 @@ define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits, ptr %escape) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movb $64, %cl ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: shrl %cl, %ebx @@ -2719,9 +2721,9 @@ define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits, ptr %escape) nounwind { ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movb $64, %cl ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx ; X86-BMI1-NEXT: shrl %cl, %ebx @@ -2747,9 +2749,9 @@ define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits, ptr %escape) nounwind { ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movb $64, %dl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %dl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl $-1, %esi ; X86-BMI2-NEXT: shrxl %edx, %esi, %edi ; X86-BMI2-NEXT: testb $32, %dl @@ -2814,9 +2816,9 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwi ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movb $64, %cl ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl $-1, %edi ; X86-NOBMI-NEXT: movl $-1, %ebx ; X86-NOBMI-NEXT: shrl %cl, %ebx @@ -2843,9 +2845,9 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwi ; X86-BMI1-NEXT: pushl %edi ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movb $64, %cl ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1-NEXT: movl $-1, %edi ; X86-BMI1-NEXT: movl $-1, %ebx ; X86-BMI1-NEXT: shrl %cl, %ebx @@ -2871,9 +2873,9 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwi ; X86-BMI2-NEXT: pushl %edi ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movb $64, %dl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %dl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI2-NEXT: movl $-1, %esi ; X86-BMI2-NEXT: shrxl %edx, %esi, %edi ; X86-BMI2-NEXT: testb $32, %dl @@ -2937,9 +2939,9 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits, ptr %escape) nounwi ; X86-NOBMI-LABEL: bzhi64_c4_commutative: ; X86-NOBMI: # %bb.0: ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl 
{{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movb $64, %cl ; X86-NOBMI-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl $-1, %eax ; X86-NOBMI-NEXT: movl $-1, %edx ; X86-NOBMI-NEXT: shrl %cl, %edx @@ -2959,9 +2961,9 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits, ptr %escape) nounwi ; X86-BMI1-LABEL: bzhi64_c4_commutative: ; X86-BMI1: # %bb.0: ; X86-BMI1-NEXT: pushl %esi -; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movb $64, %cl ; X86-BMI1-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-BMI1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI1-NEXT: movl $-1, %eax ; X86-BMI1-NEXT: movl $-1, %edx ; X86-BMI1-NEXT: shrl %cl, %edx @@ -2981,9 +2983,9 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits, ptr %escape) nounwi ; X86-BMI2-LABEL: bzhi64_c4_commutative: ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %ebx -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movb $64, %bl ; X86-BMI2-NEXT: subb {{[0-9]+}}(%esp), %bl +; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $-1, %eax ; X86-BMI2-NEXT: shrxl %ebx, %eax, %edx ; X86-BMI2-NEXT: testb $32, %bl @@ -3500,8 +3502,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %esi, %edi +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB48_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %eax, %edi @@ -3509,8 +3511,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ebx, %ebx -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB48_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %ebx @@ -3539,8 +3541,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: shldl %cl, %edx, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %esi, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB48_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %eax, %edi @@ -3548,8 +3550,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edi, %eax ; X86-BMI1-NEXT: shrl %cl, %eax ; X86-BMI1-NEXT: xorl %ebx, %ebx -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB48_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %ebx @@ -3637,8 +3639,8 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %esi, %edi +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB49_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %eax, %edi @@ -3646,8 +3648,8 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ebx, %ebx -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB49_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %ebx @@ -3676,8 +3678,8 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl 
%edx, %esi ; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: shldl %cl, %edx, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %esi, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB49_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %eax, %edi @@ -3685,8 +3687,8 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edi, %eax ; X86-BMI1-NEXT: shrl %cl, %eax ; X86-BMI1-NEXT: xorl %ebx, %ebx -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB49_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %ebx @@ -3778,8 +3780,8 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %esi, %edi +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB50_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %eax, %edi @@ -3787,8 +3789,8 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ebx, %ebx -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB50_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %ebx @@ -3818,8 +3820,8 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: shldl %cl, %edx, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %esi, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB50_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %eax, %edi @@ -3827,8 +3829,8 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edi, %eax ; X86-BMI1-NEXT: shrl %cl, %eax ; X86-BMI1-NEXT: xorl %ebx, %ebx -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB50_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %ebx @@ -3919,8 +3921,8 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: shldl %cl, %edx, %eax -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl %esi, %edi +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB51_2 ; X86-NOBMI-NEXT: # %bb.1: ; X86-NOBMI-NEXT: movl %eax, %edi @@ -3928,8 +3930,8 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: shrl %cl, %eax ; X86-NOBMI-NEXT: xorl %ebx, %ebx -; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: movl $0, %edx +; X86-NOBMI-NEXT: testb $32, %cl ; X86-NOBMI-NEXT: jne .LBB51_4 ; X86-NOBMI-NEXT: # %bb.3: ; X86-NOBMI-NEXT: movl %esi, %ebx @@ -3959,8 +3961,8 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edx, %esi ; X86-BMI1-NEXT: shll %cl, %esi ; X86-BMI1-NEXT: shldl %cl, %edx, %eax -; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: movl %esi, %edi +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB51_2 ; X86-BMI1-NEXT: # %bb.1: ; X86-BMI1-NEXT: movl %eax, %edi @@ -3968,8 +3970,8 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; X86-BMI1-NEXT: movl %edi, %eax ; X86-BMI1-NEXT: shrl %cl, %eax ; X86-BMI1-NEXT: xorl %ebx, %ebx -; X86-BMI1-NEXT: testb $32, %cl ; 
X86-BMI1-NEXT: movl $0, %edx +; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: jne .LBB51_4 ; X86-BMI1-NEXT: # %bb.3: ; X86-BMI1-NEXT: movl %esi, %ebx diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll index 255ea44e520c0..ddc913708fabd 100644 --- a/llvm/test/CodeGen/X86/extract-store.ll +++ b/llvm/test/CodeGen/X86/extract-store.ll @@ -463,19 +463,9 @@ define void @extract_f64_1(ptr nocapture %dst, <2 x double> %foo) nounwind { define void @extract_f128_0(ptr nocapture %dst, <2 x fp128> %foo) nounwind { ; SSE-X86-LABEL: extract_f128_0: ; SSE-X86: # %bb.0: -; SSE-X86-NEXT: pushl %edi -; SSE-X86-NEXT: pushl %esi +; SSE-X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; SSE-X86-NEXT: movl %esi, 12(%edi) -; SSE-X86-NEXT: movl %edx, 8(%edi) -; SSE-X86-NEXT: movl %ecx, 4(%edi) -; SSE-X86-NEXT: movl %eax, (%edi) -; SSE-X86-NEXT: popl %esi -; SSE-X86-NEXT: popl %edi +; SSE-X86-NEXT: movups %xmm0, (%eax) ; SSE-X86-NEXT: retl ; ; SSE-X64-LABEL: extract_f128_0: @@ -502,19 +492,9 @@ define void @extract_f128_0(ptr nocapture %dst, <2 x fp128> %foo) nounwind { define void @extract_f128_1(ptr nocapture %dst, <2 x fp128> %foo) nounwind { ; SSE-X86-LABEL: extract_f128_1: ; SSE-X86: # %bb.0: -; SSE-X86-NEXT: pushl %edi -; SSE-X86-NEXT: pushl %esi +; SSE-X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; SSE-X86-NEXT: movl %esi, 12(%edi) -; SSE-X86-NEXT: movl %edx, 8(%edi) -; SSE-X86-NEXT: movl %ecx, 4(%edi) -; SSE-X86-NEXT: movl %eax, (%edi) -; SSE-X86-NEXT: popl %esi -; SSE-X86-NEXT: popl %edi +; SSE-X86-NEXT: movups %xmm0, (%eax) ; SSE-X86-NEXT: retl ; ; SSE-X64-LABEL: extract_f128_1: diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 1706f17eac165..c338eb9f0b335 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -952,9 +952,9 @@ define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: copysign_v4f32: ; X64: # %bb.0: ; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 +; X64-NEXT: vandps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index ce68eebd5b752..8dd6710e20f2c 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -489,33 +489,61 @@ define i32 @main() nounwind { ; X64-SSSE3-NEXT: popq %rbp ; X64-SSSE3-NEXT: retq ; -; X64-AVX-LABEL: main: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: pushq %rbp -; X64-AVX-NEXT: movq %rsp, %rbp -; X64-AVX-NEXT: andq $-32, %rsp -; X64-AVX-NEXT: subq $64, %rsp -; X64-AVX-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-AVX-NEXT: vmovaps (%rax), %ymm0 -; X64-AVX-NEXT: movl zero+4(%rip), %ecx -; X64-AVX-NEXT: movl zero+8(%rip), %eax -; X64-AVX-NEXT: vmovaps %ymm0, 
zero(%rip) -; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX-NEXT: vmovaps (%rsp), %ymm0 -; X64-AVX-NEXT: vextractps $2, %xmm0, %esi -; X64-AVX-NEXT: xorl %edx, %edx -; X64-AVX-NEXT: divl %esi -; X64-AVX-NEXT: movl %eax, %esi -; X64-AVX-NEXT: vextractps $1, %xmm0, %edi -; X64-AVX-NEXT: movl %ecx, %eax -; X64-AVX-NEXT: xorl %edx, %edx -; X64-AVX-NEXT: divl %edi -; X64-AVX-NEXT: addl %esi, %eax -; X64-AVX-NEXT: movq %rbp, %rsp -; X64-AVX-NEXT: popq %rbp -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: main: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: movq %rsp, %rbp +; X64-AVX1-NEXT: andq $-32, %rsp +; X64-AVX1-NEXT: subq $64, %rsp +; X64-AVX1-NEXT: movq n1@GOTPCREL(%rip), %rax +; X64-AVX1-NEXT: vmovaps (%rax), %ymm0 +; X64-AVX1-NEXT: movl zero+4(%rip), %ecx +; X64-AVX1-NEXT: movl zero+8(%rip), %eax +; X64-AVX1-NEXT: vmovaps %ymm0, zero(%rip) +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX1-NEXT: vmovaps (%rsp), %ymm0 +; X64-AVX1-NEXT: vextractps $2, %xmm0, %esi +; X64-AVX1-NEXT: vextractps $1, %xmm0, %edi +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %esi +; X64-AVX1-NEXT: movl %eax, %esi +; X64-AVX1-NEXT: movl %ecx, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %edi +; X64-AVX1-NEXT: addl %esi, %eax +; X64-AVX1-NEXT: movq %rbp, %rsp +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: main: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: pushq %rbp +; X64-AVX2-NEXT: movq %rsp, %rbp +; X64-AVX2-NEXT: andq $-32, %rsp +; X64-AVX2-NEXT: subq $64, %rsp +; X64-AVX2-NEXT: movq n1@GOTPCREL(%rip), %rax +; X64-AVX2-NEXT: vmovaps (%rax), %ymm0 +; X64-AVX2-NEXT: movl zero+4(%rip), %ecx +; X64-AVX2-NEXT: movl zero+8(%rip), %eax +; X64-AVX2-NEXT: vmovaps %ymm0, zero(%rip) +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX2-NEXT: vmovaps (%rsp), %ymm0 +; X64-AVX2-NEXT: vextractps $2, %xmm0, %esi +; X64-AVX2-NEXT: vextractps $1, %xmm0, %edi +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: movl %eax, %esi +; X64-AVX2-NEXT: movl %ecx, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %edi +; X64-AVX2-NEXT: addl %esi, %eax +; X64-AVX2-NEXT: movq %rbp, %rsp +; X64-AVX2-NEXT: popq %rbp +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %stackptr = alloca <8 x i32>, align 32 %z = load <8 x i32>, ptr @zero, align 32 %t1 = load <8 x i32>, ptr @n1, align 32 diff --git a/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll b/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll index eb2bd08a2b14c..b8fee60d3ef64 100644 --- a/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll +++ b/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll @@ -71,17 +71,10 @@ define i64 @select_fcmp_ogt_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_oge_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_oge_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovbq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_oge_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmovbq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp oge double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -109,17 +102,10 @@ define 
i64 @select_fcmp_olt_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_ole_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm0, %xmm1 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovbq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_ole_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1 -; FAST_AVX-NEXT: cmovbq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp ole double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -128,17 +114,10 @@ define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_one_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmoveq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_one_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmoveq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp one double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -147,17 +126,10 @@ define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_ord_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovpq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_ord_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmovpq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp ord double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -166,17 +138,10 @@ define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_uno_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovnpq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_uno_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmovnpq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp uno double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -185,17 +150,10 @@ define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_ueq_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovneq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_ueq_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmovneq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp ueq double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -204,17 +162,10 @@ define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_ugt_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_ugt_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm0, %xmm1 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovaeq %rsi, %rax 
; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_ugt_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1 -; FAST_AVX-NEXT: cmovaeq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp ugt double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 @@ -242,17 +193,10 @@ define i64 @select_fcmp_uge_cmov(double %a, double %b, i64 %c, i64 %d) { define i64 @select_fcmp_ult_cmov(double %a, double %b, i64 %c, i64 %d) { ; NOAVX-LABEL: select_fcmp_ult_cmov: ; NOAVX: ## %bb.0: -; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: ucomisd %xmm1, %xmm0 +; NOAVX-NEXT: movq %rdi, %rax ; NOAVX-NEXT: cmovaeq %rsi, %rax ; NOAVX-NEXT: retq -; -; FAST_AVX-LABEL: select_fcmp_ult_cmov: -; FAST_AVX: ## %bb.0: -; FAST_AVX-NEXT: movq %rdi, %rax -; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0 -; FAST_AVX-NEXT: cmovaeq %rsi, %rax -; FAST_AVX-NEXT: retq %1 = fcmp ult double %a, %b %2 = select i1 %1, i64 %c, i64 %d ret i64 %2 diff --git a/llvm/test/CodeGen/X86/fcmp-logic.ll b/llvm/test/CodeGen/X86/fcmp-logic.ll index 7b806bca43c2e..0fde02f44bb7b 100644 --- a/llvm/test/CodeGen/X86/fcmp-logic.ll +++ b/llvm/test/CodeGen/X86/fcmp-logic.ll @@ -271,23 +271,35 @@ define i1 @une_uno_xor_f64_use1(double %w, double %x, double %y, double %z, ptr ; SSE2-NEXT: setp %al ; SSE2-NEXT: setne %cl ; SSE2-NEXT: orb %al, %cl -; SSE2-NEXT: movb %cl, (%rdi) ; SSE2-NEXT: ucomisd %xmm3, %xmm2 +; SSE2-NEXT: movb %cl, (%rdi) ; SSE2-NEXT: setp %al ; SSE2-NEXT: xorb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: une_uno_xor_f64_use1: -; AVX: # %bb.0: -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: setp %al -; AVX-NEXT: setne %cl -; AVX-NEXT: orb %al, %cl -; AVX-NEXT: movb %cl, (%rdi) -; AVX-NEXT: vucomisd %xmm3, %xmm2 -; AVX-NEXT: setp %al -; AVX-NEXT: xorb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: une_uno_xor_f64_use1: +; AVX1: # %bb.0: +; AVX1-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-NEXT: setp %al +; AVX1-NEXT: setne %cl +; AVX1-NEXT: orb %al, %cl +; AVX1-NEXT: vucomisd %xmm3, %xmm2 +; AVX1-NEXT: movb %cl, (%rdi) +; AVX1-NEXT: setp %al +; AVX1-NEXT: xorb %cl, %al +; AVX1-NEXT: retq +; +; AVX512-LABEL: une_uno_xor_f64_use1: +; AVX512: # %bb.0: +; AVX512-NEXT: vucomisd %xmm1, %xmm0 +; AVX512-NEXT: setp %al +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: movb %cl, (%rdi) +; AVX512-NEXT: vucomisd %xmm3, %xmm2 +; AVX512-NEXT: setp %al +; AVX512-NEXT: xorb %cl, %al +; AVX512-NEXT: retq %f1 = fcmp une double %w, %x store i1 %f1, ptr %p %f2 = fcmp uno double %y, %z @@ -305,8 +317,8 @@ define i1 @une_uno_xor_f64_use2(double %w, double %x, double %y, double %z, ptr ; SSE2-NEXT: setne %cl ; SSE2-NEXT: orb %al, %cl ; SSE2-NEXT: ucomisd %xmm3, %xmm2 -; SSE2-NEXT: setp %al ; SSE2-NEXT: setp (%rdi) +; SSE2-NEXT: setp %al ; SSE2-NEXT: xorb %cl, %al ; SSE2-NEXT: retq ; @@ -317,8 +329,8 @@ define i1 @une_uno_xor_f64_use2(double %w, double %x, double %y, double %z, ptr ; AVX-NEXT: setne %cl ; AVX-NEXT: orb %al, %cl ; AVX-NEXT: vucomisd %xmm3, %xmm2 -; AVX-NEXT: setp %al ; AVX-NEXT: setp (%rdi) +; AVX-NEXT: setp %al ; AVX-NEXT: xorb %cl, %al ; AVX-NEXT: retq %f1 = fcmp une double %w, %x diff --git a/llvm/test/CodeGen/X86/fixup-bw-inst.ll b/llvm/test/CodeGen/X86/fixup-bw-inst.ll index 4301498912003..c33e5ae239f7f 100644 --- a/llvm/test/CodeGen/X86/fixup-bw-inst.ll +++ b/llvm/test/CodeGen/X86/fixup-bw-inst.ll @@ -14,35 +14,41 @@ define void @foo1(i32 %count, ptr noalias nocapture %q, ptr noalias nocapture %p ; BWON-LABEL: foo1: ; BWON: ## %bb.0: ; BWON-NEXT: testl %edi, %edi -; BWON-NEXT: jle LBB0_2 +; BWON-NEXT: jle 
LBB0_3 +; BWON-NEXT: ## %bb.1: ## %.lr.ph +; BWON-NEXT: movl %edi, %eax +; BWON-NEXT: xorl %ecx, %ecx ; BWON-NEXT: .p2align 4 -; BWON-NEXT: LBB0_1: ## %a4 +; BWON-NEXT: LBB0_2: ## %a4 ; BWON-NEXT: ## =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movzbl (%rsi), %eax -; BWON-NEXT: movb %al, (%rdx) -; BWON-NEXT: movzbl 1(%rsi), %eax -; BWON-NEXT: movb %al, 1(%rdx) -; BWON-NEXT: addq $8, %rdx -; BWON-NEXT: decl %edi -; BWON-NEXT: jne LBB0_1 -; BWON-NEXT: LBB0_2: ## %._crit_edge +; BWON-NEXT: movzbl (%rsi), %edi +; BWON-NEXT: movb %dil, (%rdx,%rcx,8) +; BWON-NEXT: movzbl 1(%rsi), %edi +; BWON-NEXT: movb %dil, 1(%rdx,%rcx,8) +; BWON-NEXT: incq %rcx +; BWON-NEXT: cmpl %ecx, %eax +; BWON-NEXT: jne LBB0_2 +; BWON-NEXT: LBB0_3: ## %._crit_edge ; BWON-NEXT: retq ; ; BWOFF-LABEL: foo1: ; BWOFF: ## %bb.0: ; BWOFF-NEXT: testl %edi, %edi -; BWOFF-NEXT: jle LBB0_2 +; BWOFF-NEXT: jle LBB0_3 +; BWOFF-NEXT: ## %bb.1: ## %.lr.ph +; BWOFF-NEXT: movl %edi, %eax +; BWOFF-NEXT: xorl %ecx, %ecx ; BWOFF-NEXT: .p2align 4 -; BWOFF-NEXT: LBB0_1: ## %a4 +; BWOFF-NEXT: LBB0_2: ## %a4 ; BWOFF-NEXT: ## =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movb (%rsi), %al -; BWOFF-NEXT: movb %al, (%rdx) -; BWOFF-NEXT: movb 1(%rsi), %al -; BWOFF-NEXT: movb %al, 1(%rdx) -; BWOFF-NEXT: addq $8, %rdx -; BWOFF-NEXT: decl %edi -; BWOFF-NEXT: jne LBB0_1 -; BWOFF-NEXT: LBB0_2: ## %._crit_edge +; BWOFF-NEXT: movb (%rsi), %dil +; BWOFF-NEXT: movb %dil, (%rdx,%rcx,8) +; BWOFF-NEXT: movb 1(%rsi), %dil +; BWOFF-NEXT: movb %dil, 1(%rdx,%rcx,8) +; BWOFF-NEXT: incq %rcx +; BWOFF-NEXT: cmpl %ecx, %eax +; BWOFF-NEXT: jne LBB0_2 +; BWOFF-NEXT: LBB0_3: ## %._crit_edge ; BWOFF-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll index b9a2772d3df70..31a9f3804542a 100644 --- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll @@ -189,7 +189,7 @@ define <4 x double> @negated_constant_v4f64_fadd(<4 x double> %a) { define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x double> %b) { ; FMA3-LABEL: negated_constant_v4f64_2fma_undefs: ; FMA3: # %bb.0: -; FMA3-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; FMA3-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem ; FMA3-NEXT: vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2 ; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0 @@ -197,7 +197,7 @@ define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x do ; ; FMA4-LABEL: negated_constant_v4f64_2fma_undefs: ; FMA4: # %bb.0: -; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + mem ; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2 ; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll index a886a3c830340..71942f3effdab 100644 --- a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll @@ -343,7 +343,7 @@ entry: define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test_mm256_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 @@ -358,7 +358,7 @@ entry: define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) { ; CHECK-LABEL: test_mm256_fnmsub_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorpd %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorpd %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index 0ffcb8c46cef9..9c2f688e4fc83 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1367,7 +1367,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) { ; FMA-INFS-LABEL: test_v8f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1375,7 +1375,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ; ; FMA4-INFS-LABEL: test_v8f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 @@ -1517,7 +1517,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) { ; FMA-INFS-LABEL: test_v4f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1525,7 +1525,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ; ; FMA4-INFS-LABEL: test_v4f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index fe5ddca67470c..9d190a18c4552 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -259,7 +259,7 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double define <16 x float> 
@test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -268,7 +268,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -305,7 +305,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -314,7 +314,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -351,7 +351,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -360,7 +360,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -397,7 +397,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: 
vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -406,7 +406,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -443,7 +443,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -452,7 +452,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -490,7 +490,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -499,7 +499,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -537,7 +537,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -546,7 +546,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = 
[-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -584,7 +584,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -593,7 +593,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -631,7 +631,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -640,7 +640,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -677,7 +677,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -686,7 +686,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -723,7 +723,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> 
%x, <8 x double> %y define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -732,7 +732,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -769,7 +769,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -778,7 +778,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -819,7 +819,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) { ; FMA-INFS-LABEL: test_v16f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -830,7 +830,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ; ; FMA4-INFS-LABEL: test_v16f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -878,7 +878,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) { ; FMA-INFS-LABEL: test_v8f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd 
%ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -889,7 +889,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; ; FMA4-INFS-LABEL: test_v8f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -1143,7 +1143,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; FMA: # %bb.0: ; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; FMA-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA-NEXT: retq @@ -1152,7 +1152,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; FMA4: # %bb.0: ; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; FMA4-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA4-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA4-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 80e3a017a44e3..7b69d5504dda3 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -22,25 +22,25 @@ declare <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat>, <4 x bfloat>) define float @test_fmaximum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fmaximum: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB0_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpunordss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: cmpunordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 ; SSE2-NEXT: js .LBB0_4 ; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: .LBB0_4: ; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximum: @@ -460,8 +460,8 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB9_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm1, %xmm3 @@ -566,8 +566,8 @@ define float @test_fminimum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fminimum: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB10_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -818,9 +818,9 @@ define double @test_fminimum_zero0(double %x, double %y) nounwind { ; ; AVX1-LABEL: 
test_fminimum_zero0: ; AVX1: # %bb.0: -; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm0 -; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimum_zero0: @@ -869,9 +869,9 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind { ; ; AVX1-LABEL: test_fminimum_zero1: ; AVX1: # %bb.0: -; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimum_zero1: @@ -984,8 +984,8 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB19_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm0, %xmm3 @@ -1415,9 +1415,9 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) { ; ; AVX-LABEL: test_fminimum_vector_signed_zero: ; AVX: # %bb.0: -; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimum_vector_signed_zero: @@ -1427,9 +1427,9 @@ define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) { ; ; X86-LABEL: test_fminimum_vector_signed_zero: ; X86: # %bb.0: -; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 -; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r @@ -1584,8 +1584,8 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1618,8 +1618,8 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1651,8 +1651,8 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js 
.LBB33_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1680,8 +1680,8 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_14 ; SSE2-NEXT: # %bb.13: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1878,9 +1878,9 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512-NEXT: vucomiss %xmm2, %xmm3 ; AVX512-NEXT: movl $0, %r15d -; AVX512-NEXT: cmovpl %ecx, %r15d ; AVX512-NEXT: movl $0, %r12d ; AVX512-NEXT: cmoval %ecx, %r12d +; AVX512-NEXT: cmovpl %ecx, %r15d ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3 ; AVX512-NEXT: vucomiss %xmm2, %xmm3 @@ -2154,8 +2154,8 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; SSE2-NEXT: .cfi_offset %r14, -32 ; SSE2-NEXT: .cfi_offset %r15, -24 ; SSE2-NEXT: .cfi_offset %rbp, -16 -; SSE2-NEXT: pextrw $0, %xmm1, %r14d -; SSE2-NEXT: pextrw $0, %xmm0, %r15d +; SSE2-NEXT: pextrw $0, %xmm1, %r15d +; SSE2-NEXT: pextrw $0, %xmm0, %r14d ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $16, %xmm2 ; SSE2-NEXT: pextrw $0, %xmm2, %eax @@ -2166,8 +2166,8 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: testl %ecx, %ecx ; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: testl %ecx, %ecx ; SSE2-NEXT: js .LBB34_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm7 @@ -2193,12 +2193,12 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 ; SSE2-NEXT: shll $16, %r14d -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: testl %r15d, %r15d +; SSE2-NEXT: movd %r14d, %xmm3 +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: movd %r15d, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: js .LBB34_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -2226,8 +2226,8 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; SSE2-NEXT: movd %ebx, %xmm1 ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: testl %ebx, %ebx ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: testl %ebx, %ebx ; SSE2-NEXT: js .LBB34_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm3, %xmm2 @@ -2249,8 +2249,8 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; SSE2-NEXT: movd %r14d, %xmm1 ; SSE2-NEXT: shll $16, %r15d ; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: js .LBB34_14 ; SSE2-NEXT: # %bb.13: ; SSE2-NEXT: movdqa %xmm3, %xmm2 @@ -2307,35 +2307,35 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; AVX1-NEXT: .cfi_offset %r14, -32 ; AVX1-NEXT: .cfi_offset %r15, -24 ; AVX1-NEXT: .cfi_offset %rbp, -16 -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX1-NEXT: vpextrw $0, %xmm4, %ebx -; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpextrw $0, 
%xmm4, %r14d +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm2, %ebx +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm2, %r14d ; AVX1-NEXT: vpextrw $0, %xmm0, %r12d ; AVX1-NEXT: vpextrw $0, %xmm1, %r13d +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, %eax -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, %ecx +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpextrw $0, %xmm1, %ecx ; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vmovd %ecx, %xmm1 ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: vmovd %eax, %xmm4 ; AVX1-NEXT: js .LBB34_1 ; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa %xmm4, %xmm3 ; AVX1-NEXT: jmp .LBB34_3 ; AVX1-NEXT: .LBB34_1: -; AVX1-NEXT: vmovdqa %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa %xmm4, %xmm1 ; AVX1-NEXT: .LBB34_3: ; AVX1-NEXT: vpextrw $0, %xmm2, %ebp -; AVX1-NEXT: vpextrw $0, %xmm3, %r15d -; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrw $0, %xmm0, %r15d +; AVX1-NEXT: vmaxss %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: shll $16, %r13d @@ -2538,35 +2538,35 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 -; X86-NEXT: vpsrlq $48, %xmm1, %xmm3 -; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X86-NEXT: vpextrw $0, %xmm4, %esi -; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; X86-NEXT: vpextrw $0, %xmm4, %ebx +; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm2, %esi +; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm2, %ebx ; X86-NEXT: vpextrw $0, %xmm0, %eax ; X86-NEXT: vpextrw $0, %xmm1, %ecx +; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 ; X86-NEXT: vpsrld $16, %xmm0, %xmm0 ; X86-NEXT: vpextrw $0, %xmm0, %edx -; X86-NEXT: vpsrld $16, %xmm1, %xmm0 -; X86-NEXT: vpextrw $0, %xmm0, %edi +; X86-NEXT: vpsrlq $48, %xmm1, %xmm0 +; X86-NEXT: vpsrld $16, %xmm1, %xmm1 +; X86-NEXT: vpextrw $0, %xmm1, %edi ; X86-NEXT: shll $16, %edi -; X86-NEXT: vmovd %edi, %xmm0 +; X86-NEXT: vmovd %edi, %xmm1 ; X86-NEXT: shll $16, %edx ; X86-NEXT: vmovd %edx, %xmm4 ; X86-NEXT: js .LBB34_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: vmovdqa %xmm4, %xmm1 +; X86-NEXT: vmovdqa %xmm4, %xmm3 ; X86-NEXT: jmp .LBB34_3 ; X86-NEXT: .LBB34_1: -; X86-NEXT: vmovdqa %xmm0, %xmm1 -; X86-NEXT: vmovdqa %xmm4, %xmm0 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: vmovdqa %xmm4, %xmm1 ; X86-NEXT: .LBB34_3: ; X86-NEXT: vpextrw $0, %xmm2, %edi -; X86-NEXT: vpextrw $0, %xmm3, %ebp -; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpextrw $0, %xmm0, %ebp +; X86-NEXT: vmaxss %xmm3, %xmm1, %xmm0 +; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: shll $16, %ecx ; X86-NEXT: vmovd %ecx, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 5945bae94f452..98001ca9411fa 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -22,25 +22,25 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>) define float @test_fmaximumnum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fmaximumnum: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB0_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: .LBB0_2: -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: cmpordss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: cmpordss %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 ; SSE2-NEXT: js .LBB0_4 ; SSE2-NEXT: # %bb.3: -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: .LBB0_4: ; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm0 -; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: andnps %xmm3, %xmm2 +; SSE2-NEXT: orps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximumnum: @@ -457,8 +457,8 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB9_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm1, %xmm3 @@ -563,8 +563,8 @@ define float @test_fminimumnum(float %x, float %y) nounwind { ; SSE2-LABEL: test_fminimumnum: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB10_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm0, %xmm3 @@ -812,9 +812,9 @@ define double @test_fminimumnum_zero0(double %x, double %y) nounwind { ; ; AVX1-LABEL: test_fminimumnum_zero0: ; AVX1: # %bb.0: -; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm0 -; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimumnum_zero0: @@ -863,9 +863,9 @@ define double @test_fminimumnum_zero1(double %x, double %y) nounwind { ; ; AVX1-LABEL: test_fminimumnum_zero1: ; AVX1: # %bb.0: -; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: test_fminimumnum_zero1: @@ -978,8 +978,8 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB19_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm0, %xmm3 @@ -1330,9 +1330,9 @@ define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) { define <2 x double> 
@test_fminimumnum_vector_nan(<2 x double> %x) { ; SSE2-LABEL: test_fminimumnum_vector_nan: ; SSE2: # %bb.0: -; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: minpd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE2-NEXT: movapd %xmm1, %xmm0 @@ -1408,9 +1408,9 @@ define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) { ; ; AVX-LABEL: test_fminimumnum_vector_signed_zero: ; AVX: # %bb.0: -; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero: @@ -1420,9 +1420,9 @@ define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) { ; ; X86-LABEL: test_fminimumnum_vector_signed_zero: ; X86: # %bb.0: -; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 -; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 -; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 +; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r @@ -1577,8 +1577,8 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1611,8 +1611,8 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1644,8 +1644,8 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1673,8 +1673,8 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB33_14 ; SSE2-NEXT: # %bb.13: ; SSE2-NEXT: movdqa %xmm4, %xmm2 @@ -1813,21 +1813,21 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-LABEL: test_fmaximumnum_v4f16: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512-NEXT: vmovdqa %xmm0, %xmm8 +; AVX512-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512-NEXT: vmovdqa %xmm0, %xmm4 ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} +; AVX512-NEXT: vucomiss %xmm1, %xmm1 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1838,47 +1838,42 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9 -; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm2 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm7 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm2, 
%xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 @@ -1888,232 +1883,240 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm8 +; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0] ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 ; AVX512-NEXT: setp %al +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm5 +; AVX512-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm8[1,0] -; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512-NEXT: vucomiss %xmm7, %xmm7 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm2, %xmm7, %xmm7 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14 -; AVX512-NEXT: vcvtph2ps %xmm14, %xmm7 -; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm0 +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm7 +; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm15 +; AVX512-NEXT: vucomiss %xmm15, %xmm0 ; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vxorps %xmm15, %xmm15, %xmm15 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0 -; AVX512-NEXT: 
vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 -; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrlq $48, %xmm8, %xmm1 +; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: setp %cl ; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12 -; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13 -; AVX512-NEXT: vcvtph2ps %xmm13, %xmm6 -; AVX512-NEXT: vucomiss %xmm6, %xmm1 -; AVX512-NEXT: seta %al +; AVX512-NEXT: setp %dl +; AVX512-NEXT: kmovw %edx, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm6, %xmm6 {%k1} -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm9 +; AVX512-NEXT: vmovss %xmm0, %xmm15, %xmm15 {%k1} +; AVX512-NEXT: vucomiss %xmm9, %xmm1 +; AVX512-NEXT: seta %al +; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: setp %dl +; AVX512-NEXT: vucomiss %xmm2, %xmm2 +; AVX512-NEXT: setp %sil +; AVX512-NEXT: kmovw %esi, %k1 +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14 +; AVX512-NEXT: kmovw %edx, %k1 +; AVX512-NEXT: vcvtph2ps %xmm14, %xmm6 +; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; AVX512-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm10 -; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3 -; AVX512-NEXT: vmovss %xmm3, 
%xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm11 -; AVX512-NEXT: vcvtph2ps %xmm11, %xmm5 -; AVX512-NEXT: vucomiss %xmm5, %xmm3 +; AVX512-NEXT: vucomiss %xmm5, %xmm6 ; AVX512-NEXT: seta %al +; AVX512-NEXT: vmovss %xmm1, %xmm9, %xmm9 {%k1} ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm3, %xmm5, %xmm5 {%k1} -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vcvtph2ps %xmm8, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm3 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm7 -; AVX512-NEXT: vcvtph2ps %xmm7, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm1 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vpsrld $16, %xmm4, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovss %xmm6, %xmm5, %xmm5 {%k1} ; AVX512-NEXT: vucomiss %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm6 ; AVX512-NEXT: setp %al +; AVX512-NEXT: vucomiss %xmm6, %xmm6 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm12 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrld $16, %xmm8, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm12, %xmm7 +; AVX512-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm13 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512-NEXT: vcvtph2ps %xmm13, %xmm1 +; AVX512-NEXT: vucomiss %xmm1, %xmm7 +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovss %xmm7, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vucomiss %xmm3, %xmm3 +; AVX512-NEXT: vpsrld $16, %xmm4, %xmm4 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512-NEXT: vucomiss %xmm4, %xmm4 ; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm4, %xmm4 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm8 -; AVX512-NEXT: vcvtph2ps %xmm8, %xmm4 -; AVX512-NEXT: vmovss %xmm4, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm4 +; AVX512-NEXT: vucomiss %xmm4, %xmm4 +; AVX512-NEXT: setp %cl +; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm10 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vcvtph2ps %xmm10, %xmm0 +; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm11 +; AVX512-NEXT: vcvtph2ps %xmm8, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm11, %xmm8 +; AVX512-NEXT: vucomiss %xmm8, %xmm0 ; AVX512-NEXT: seta %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm4 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512-NEXT: vmulss %xmm4, %xmm9, %xmm4 +; AVX512-NEXT: vmovss %xmm0, %xmm8, %xmm8 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm15, %xmm0 +; AVX512-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, 
%xmm15 # 16-byte Folded Reload +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm2, %xmm6, %xmm6 +; AVX512-NEXT: vcvtps2ph $4, %xmm9, %xmm9 +; AVX512-NEXT: vmulss %xmm2, %xmm3, %xmm7 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm4 ; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm3 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512-NEXT: vmulss %xmm5, %xmm9, %xmm5 -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vmulss %xmm2, %xmm5, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm5 +; AVX512-NEXT: vcvtps2ph $4, %xmm8, %xmm1 +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm2 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm8, %xmm8 +; AVX512-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm9 +; AVX512-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm7 +; AVX512-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm15 +; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm6 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1,2,3] -; AVX512-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm15[1,2,3] -; AVX512-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0],xmm15[1,2,3] -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3] -; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm2 -; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm6 -; AVX512-NEXT: vcvtps2ph $4, %xmm9, %xmm4 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX512-NEXT: # xmm14 = 
xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm9[0] -; AVX512-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX512-NEXT: vpcmpeqw %xmm3, %xmm8, %xmm9 -; AVX512-NEXT: vpblendvb %xmm9, %xmm3, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] +; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512-NEXT: vpcmpeqw %xmm5, %xmm4, %xmm10 +; AVX512-NEXT: vpblendvb %xmm10, %xmm4, %xmm2, %xmm4 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX512-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; AVX512-NEXT: vpcmpeqw %xmm1, %xmm8, %xmm7 -; AVX512-NEXT: vpblendvb %xmm7, %xmm1, %xmm3, %xmm1 -; 
AVX512-NEXT: vcvtph2ps %xmm5, %xmm3 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX512-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX512-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm11[0],xmm10[0] +; AVX512-NEXT: vpcmpeqw %xmm5, %xmm10, %xmm5 +; AVX512-NEXT: vpblendvb %xmm5, %xmm10, %xmm4, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: xorl %eax, %eax ; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512-NEXT: vucomiss %xmm5, %xmm3 +; AVX512-NEXT: vucomiss %xmm5, %xmm1 ; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF ; AVX512-NEXT: movl $0, %edx ; AVX512-NEXT: cmovel %ecx, %edx -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm3 -; AVX512-NEXT: vucomiss %xmm5, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %esi ; AVX512-NEXT: cmovel %ecx, %esi -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm3 -; AVX512-NEXT: vucomiss %xmm5, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %edi ; AVX512-NEXT: cmovel %ecx, %edi -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm5, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %r8d ; AVX512-NEXT: cmovel %ecx, %r8d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm5, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm15, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %r9d ; AVX512-NEXT: cmovel %ecx, %r9d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm5, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %r10d ; AVX512-NEXT: cmovel %ecx, %r10d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm5, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm9, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 ; AVX512-NEXT: movl $0, %r11d ; AVX512-NEXT: cmovel %ecx, %r11d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm5, %xmm2 -; AVX512-NEXT: vmovd %esi, %xmm2 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $2, %edi, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $3, %r8d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $4, %r9d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $5, %r10d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $6, %r11d, %xmm2, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm8, %xmm0 +; AVX512-NEXT: vucomiss %xmm5, %xmm0 +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %r11d, %xmm0, %xmm0 ; 
AVX512-NEXT: cmovel %ecx, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpblendvb %xmm0, %xmm4, %xmm2, %xmm0 ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq ; @@ -2277,8 +2280,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: pushq %r14 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: subq $56, %rsp -; SSE2-NEXT: pextrw $0, %xmm1, %r14d -; SSE2-NEXT: pextrw $0, %xmm0, %r15d +; SSE2-NEXT: pextrw $0, %xmm1, %r15d +; SSE2-NEXT: pextrw $0, %xmm0, %r14d ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $16, %xmm2 ; SSE2-NEXT: pextrw $0, %xmm2, %eax @@ -2289,8 +2292,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: testl %ecx, %ecx ; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: testl %ecx, %ecx ; SSE2-NEXT: js .LBB34_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm7 @@ -2316,12 +2319,12 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm3 ; SSE2-NEXT: shll $16, %r14d -; SSE2-NEXT: movd %r14d, %xmm2 -; SSE2-NEXT: testl %r15d, %r15d +; SSE2-NEXT: movd %r14d, %xmm3 +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: movd %r15d, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: js .LBB34_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -2349,8 +2352,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: movd %ebx, %xmm1 ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movd %ebp, %xmm3 -; SSE2-NEXT: testl %ebx, %ebx ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: testl %ebx, %ebx ; SSE2-NEXT: js .LBB34_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm3, %xmm2 @@ -2372,8 +2375,8 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; SSE2-NEXT: movd %r14d, %xmm1 ; SSE2-NEXT: shll $16, %r15d ; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: testl %r14d, %r14d ; SSE2-NEXT: js .LBB34_14 ; SSE2-NEXT: # %bb.13: ; SSE2-NEXT: movdqa %xmm3, %xmm2 @@ -2412,35 +2415,35 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx ; AVX1-NEXT: subq $56, %rsp -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX1-NEXT: vpextrw $0, %xmm4, %ebx -; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpextrw $0, %xmm4, %r14d +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm2, %ebx +; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpextrw $0, %xmm2, %r14d ; AVX1-NEXT: vpextrw $0, %xmm0, %r12d ; AVX1-NEXT: vpextrw $0, %xmm1, %r13d +; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, %eax -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-NEXT: vpextrw $0, %xmm0, %ecx +; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpextrw $0, %xmm1, %ecx ; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vmovd %ecx, 
%xmm1 ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: vmovd %eax, %xmm4 ; AVX1-NEXT: js .LBB34_1 ; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa %xmm4, %xmm3 ; AVX1-NEXT: jmp .LBB34_3 ; AVX1-NEXT: .LBB34_1: -; AVX1-NEXT: vmovdqa %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa %xmm4, %xmm1 ; AVX1-NEXT: .LBB34_3: ; AVX1-NEXT: vpextrw $0, %xmm2, %ebp -; AVX1-NEXT: vpextrw $0, %xmm3, %r15d -; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpextrw $0, %xmm0, %r15d +; AVX1-NEXT: vmaxss %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: callq __truncsfbf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: shll $16, %r13d @@ -2607,35 +2610,35 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $68, %esp -; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 -; X86-NEXT: vpsrlq $48, %xmm1, %xmm3 -; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X86-NEXT: vpextrw $0, %xmm4, %esi -; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; X86-NEXT: vpextrw $0, %xmm4, %ebx +; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm2, %esi +; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-NEXT: vpextrw $0, %xmm2, %ebx ; X86-NEXT: vpextrw $0, %xmm0, %eax ; X86-NEXT: vpextrw $0, %xmm1, %ecx +; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 ; X86-NEXT: vpsrld $16, %xmm0, %xmm0 ; X86-NEXT: vpextrw $0, %xmm0, %edx -; X86-NEXT: vpsrld $16, %xmm1, %xmm0 -; X86-NEXT: vpextrw $0, %xmm0, %edi +; X86-NEXT: vpsrlq $48, %xmm1, %xmm0 +; X86-NEXT: vpsrld $16, %xmm1, %xmm1 +; X86-NEXT: vpextrw $0, %xmm1, %edi ; X86-NEXT: shll $16, %edi -; X86-NEXT: vmovd %edi, %xmm0 +; X86-NEXT: vmovd %edi, %xmm1 ; X86-NEXT: shll $16, %edx ; X86-NEXT: vmovd %edx, %xmm4 ; X86-NEXT: js .LBB34_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: vmovdqa %xmm4, %xmm1 +; X86-NEXT: vmovdqa %xmm4, %xmm3 ; X86-NEXT: jmp .LBB34_3 ; X86-NEXT: .LBB34_1: -; X86-NEXT: vmovdqa %xmm0, %xmm1 -; X86-NEXT: vmovdqa %xmm4, %xmm0 +; X86-NEXT: vmovdqa %xmm1, %xmm3 +; X86-NEXT: vmovdqa %xmm4, %xmm1 ; X86-NEXT: .LBB34_3: ; X86-NEXT: vpextrw $0, %xmm2, %edi -; X86-NEXT: vpextrw $0, %xmm3, %ebp -; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 -; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpextrw $0, %xmm0, %ebp +; X86-NEXT: vmaxss %xmm3, %xmm1, %xmm0 +; X86-NEXT: vcmpordss %xmm1, %xmm1, %xmm2 +; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: shll $16, %ecx ; X86-NEXT: vmovd %ecx, %xmm0 diff --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll index 3a4b1e6fcf77f..8cdd0ae500bda 100644 --- a/llvm/test/CodeGen/X86/fold-add.ll +++ b/llvm/test/CodeGen/X86/fold-add.ll @@ -52,7 +52,8 @@ define dso_local i64 @one() #0 { ; MPIC: # %bb.0: # %entry ; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax ; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx -; MPIC-NEXT: leaq 1(%rax,%rcx), %rax +; MPIC-NEXT: addq %rcx, %rax +; MPIC-NEXT: incq %rax ; MPIC-NEXT: retq entry: ret i64 add (i64 ptrtoint (ptr @foo to i64), i64 1) @@ -83,7 +84,8 @@ define dso_local i64 @large() #0 { ; MPIC: # %bb.0: # %entry ; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax ; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx -; 
MPIC-NEXT: leaq 1701208431(%rax,%rcx), %rax +; MPIC-NEXT: addq %rcx, %rax +; MPIC-NEXT: addq $1701208431, %rax # imm = 0x6566616F ; MPIC-NEXT: retq entry: ret i64 add (i64 ptrtoint (ptr @foo to i64), i64 1701208431) @@ -112,7 +114,8 @@ define dso_local i64 @neg_1() #0 { ; MPIC: # %bb.0: # %entry ; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax ; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx -; MPIC-NEXT: leaq -1(%rax,%rcx), %rax +; MPIC-NEXT: addq %rcx, %rax +; MPIC-NEXT: decq %rax ; MPIC-NEXT: retq entry: ret i64 add (i64 ptrtoint (ptr @foo to i64), i64 -1) @@ -141,7 +144,8 @@ define dso_local i64 @neg_0x80000000() #0 { ; MPIC: # %bb.0: # %entry ; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax ; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx -; MPIC-NEXT: leaq -2147483648(%rax,%rcx), %rax +; MPIC-NEXT: addq %rcx, %rax +; MPIC-NEXT: addq $-2147483648, %rax # imm = 0x80000000 ; MPIC-NEXT: retq entry: ret i64 add (i64 ptrtoint (ptr @foo to i64), i64 -2147483648) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 5519d9b787b7f..9a261c6f78a2c 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -266,8 +266,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $120, %rsp ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 128 -; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -373,8 +373,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow2_8xhalf: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -652,14 +652,14 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] ; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] +; CHECK-NO-FASTFMA-NEXT: vmovdqa 
{{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672] ; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; @@ -924,7 +924,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm0, %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm1 @@ -937,7 +937,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm2, %xmm1 @@ -1001,16 +1001,16 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float ; ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192] +; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq @@ -1104,29 +1104,29 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,u,u,u,u] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT +; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; CHECK-AVX2-NEXT: vpextrw $0, %xmm1, %eax ; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-AVX2-NEXT: 
callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT -; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; CHECK-AVX2-NEXT: addq $56, %rsp ; CHECK-AVX2-NEXT: retq @@ -1134,7 +1134,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-NO-FASTFMA: # %bb.0: ; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NO-FASTFMA-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] +; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,u,u,u,u] ; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -1253,12 +1253,26 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] -; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-NO-FASTFMA: # %bb.0: +; CHECK-NO-FASTFMA-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vmovdqa {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-NO-FASTFMA-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; CHECK-NO-FASTFMA-NEXT: retq +; +; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_vec: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408] +; CHECK-FMA-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq %shl = shl nuw <2 x i64> , %cnt %conv = uitofp <2 x i64> %shl to <2 x double> %mul = fdiv <2 x double> , %conv diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..e45423905db42 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -462,8 +462,8 @@ define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind { ; CHECK-NEXT: psubq %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq get.i1@PLT -; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: je .LBB6_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: addq $56, %rsp @@ -974,8 +974,8 @@ define void @simple_urem_fail_bad_loop(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: callq get.i32@PLT -; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: # implicit-def: $r14d +; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: jne .LBB16_4 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xorl %r14d, %r14d @@ 
-1046,11 +1046,12 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax +; CHECK-NEXT: leal (%r14,%r15), %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: cmpl $1, %eax ; CHECK-NEXT: movl %ecx, %r15d +; CHECK-NEXT: cmpl $1, %eax ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx @@ -1215,11 +1216,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax +; CHECK-NEXT: leal (%r14,%r15), %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: movl %ecx, %r15d +; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx @@ -1267,11 +1269,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax +; CHECK-NEXT: leal (%r14,%r15), %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: movl %ecx, %r15d +; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx @@ -1318,11 +1321,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax +; CHECK-NEXT: leal (%r14,%r15), %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: movl %ecx, %r15d +; CHECK-NEXT: cmpl $5, %eax ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx @@ -1420,11 +1424,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax +; CHECK-NEXT: leal (%r14,%r15), %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %r15d, %ecx ; CHECK-NEXT: incl %ecx -; CHECK-NEXT: cmpl $-2, %eax ; CHECK-NEXT: movl %ecx, %r15d +; CHECK-NEXT: cmpl $-2, %eax ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll index 5ea2964057588..8d243d300b004 100644 --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -25,81 +25,86 @@ define i64 @fn1() #0 { ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D -; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F -; CHECK-NEXT: movl a, %edi -; CHECK-NEXT: cmpl $0, (%edi) +; CHECK-NEXT: movl $668265295, %edi # imm = 0x27D4EB4F +; CHECK-NEXT: movl a, %ebx +; CHECK-NEXT: cmpl $0, (%ebx) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl 8(%edi), %ecx -; CHECK-NEXT: movl 12(%edi), %edx -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl 8(%ebx), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 12(%ebx), %esi +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: shldl $1, %ecx, %eax -; CHECK-NEXT: orl 
%edx, %eax -; CHECK-NEXT: leal (%ecx,%ecx), %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%edi), %ebx -; CHECK-NEXT: movl 20(%edi), %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shldl $2, %ebx, %edx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: shldl $31, %ebx, %ecx -; CHECK-NEXT: shll $2, %ebx -; CHECK-NEXT: orl %ecx, %ebx +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: movl 16(%ebx), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 20(%ebx), %edx +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: shldl $2, %ecx, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: leal (%esi,%esi), %ecx +; CHECK-NEXT: orl %esi, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edx, %esi ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: shrl %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: adcl %eax, %ecx +; CHECK-NEXT: shldl $31, %ecx, %esi +; CHECK-NEXT: shll $2, %ecx +; CHECK-NEXT: orl %esi, %ecx +; CHECK-NEXT: shrl %edx +; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 24(%edi), %eax +; CHECK-NEXT: adcl %eax, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 24(%ebx), %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: mull %edi ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D -; CHECK-NEXT: imull %eax, %ebx -; CHECK-NEXT: mull %esi -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: movl 28(%edi), %edi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: addl %edx, %esi +; CHECK-NEXT: movl $-1028477379, %eax # imm = 0xC2B2AE3D +; CHECK-NEXT: imull %ecx, %eax +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: movl 28(%ebx), %ebx +; CHECK-NEXT: imull %ebx, %edi +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: mull %edx +; CHECK-NEXT: imull $-2056954758, %ecx, %esi # imm = 0x85655C7A +; CHECK-NEXT: addl %edx, %esi +; CHECK-NEXT: imull $1336530590, %ebx, %edx # imm = 0x4FA9D69E +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; CHECK-NEXT: shrdl $3, %edi, %ebx +; CHECK-NEXT: sarl $3, %edi +; CHECK-NEXT: orl %edx, %edi +; CHECK-NEXT: orl %eax, %ebx +; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87 ; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E -; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: shrdl $3, %esi, %ecx -; CHECK-NEXT: sarl $3, %esi -; CHECK-NEXT: orl %edx, %esi -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: mull %ebx -; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC -; 
CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: imull $-66860409, %edi, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $326129324, %ebx, %eax # imm = 0x137056AC ; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; CHECK-NEXT: movl %edi, b -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: mull %ebx -; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC +; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87 +; CHECK-NEXT: mull %edx +; CHECK-NEXT: movl %esi, b +; CHECK-NEXT: imull $326129324, %esi, %esi # imm = 0x137056AC ; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: movl %ecx, b+4 ; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.else -; CHECK-NEXT: xorl b+4, %ecx -; CHECK-NEXT: xorl b, %esi +; CHECK-NEXT: xorl b, %edi ; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87 -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9 +; CHECK-NEXT: xorl b+4, %ecx +; CHECK-NEXT: imull $93298681, %edi, %esi # imm = 0x58F9FF9 ; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87 ; CHECK-NEXT: .LBB0_3: # %if.end diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll index 9a955ce5a24cb..7076820d01d20 100644 --- a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll @@ -106,10 +106,10 @@ define <2 x i256> @test_zext1() { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movups %xmm0, 40(%rdi) ; X64-NEXT: movaps %xmm0, 16(%rdi) ; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: movq $0, 40(%rdi) +; X64-NEXT: movq $0, 56(%rdi) ; X64-NEXT: movq $254, 32(%rdi) ; X64-NEXT: retq %Se = zext <2 x i8> to <2 x i256> diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll index e98fb8e374c0b..82327ddde12d4 100644 --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -567,9 +567,9 @@ define <16 x float> @round_v16f32(<16 x float> %x) { ; ; AVX1-LABEL: round_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 @@ -675,9 +675,9 @@ define <8 x double> @round_v8f64(<8 x double> %x) { ; ; AVX1-LABEL: round_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3 -; 
AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] ; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll index 6a6b86e8efa7c..d265a52d5ae6e 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll @@ -29,22 +29,6 @@ define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_oeq_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: cmovpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_oeq_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -93,21 +77,6 @@ define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ogt_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ogt_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -154,21 +123,6 @@ define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_oge_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_oge_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -217,21 +171,6 @@ define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_olt_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_olt_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -280,21 +219,6 @@ define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, 
half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ole_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ole_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -341,21 +265,6 @@ define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_one_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_one_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -402,21 +311,6 @@ define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ord_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ord_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -463,21 +357,6 @@ define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ueq_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ueq_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -526,21 +405,6 @@ define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ugt_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovael %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ugt_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -589,21 +453,6 @@ define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: 
retq ; -; AVX-LABEL: test_f16_uge_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmoval %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_uge_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -650,21 +499,6 @@ define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ult_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovael %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ult_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -711,21 +545,6 @@ define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ule_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmoval %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ule_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -837,21 +656,6 @@ define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_uno_q: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_uno_q: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -899,22 +703,6 @@ define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_oeq_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: cmovpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_oeq_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -963,21 +751,6 @@ define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: 
test_f16_ogt_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ogt_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1024,21 +797,6 @@ define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_oge_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_oge_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1087,21 +845,6 @@ define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_olt_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_olt_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1150,21 +893,6 @@ define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ole_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovbl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ole_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1211,21 +939,6 @@ define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_one_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_one_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1272,21 +985,6 @@ define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ord_s: -; AVX: # %bb.0: -; AVX-NEXT: movl 
%edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ord_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1333,21 +1031,6 @@ define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ueq_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnel %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ueq_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1396,21 +1079,6 @@ define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ugt_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovael %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ugt_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1459,21 +1127,6 @@ define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_uge_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm1, %ecx -; AVX-NEXT: vpextrw $0, %xmm0, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmoval %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_uge_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1520,21 +1173,6 @@ define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ult_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovael %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ult_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1581,21 +1219,6 @@ define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_ule_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, 
%ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmoval %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_ule_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero @@ -1707,21 +1330,6 @@ define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: test_f16_uno_s: -; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: vpextrw $0, %xmm0, %ecx -; AVX-NEXT: vpextrw $0, %xmm1, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcomiss %xmm0, %xmm1 -; AVX-NEXT: cmovnpl %esi, %eax -; AVX-NEXT: retq -; ; X86-FP16-LABEL: test_f16_uno_s: ; X86-FP16: # %bb.0: ; X86-FP16-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll index e3e2b6225a7ba..dd67f889343dc 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll @@ -172,8 +172,8 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_oge_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -187,13 +187,6 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_oge_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_oge_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -318,8 +311,8 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ole_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -333,13 +326,6 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ole_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm0, %xmm1 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ole_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -391,8 +377,8 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_one_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovel %esi, %eax ; SSE-64-NEXT: retq ; @@ -406,13 +392,6 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_one_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_one_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -464,8 +443,8 @@ define 
i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ord_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -479,13 +458,6 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ord_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ord_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -537,8 +509,8 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ueq_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnel %esi, %eax ; SSE-64-NEXT: retq ; @@ -552,13 +524,6 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ueq_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovnel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ueq_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -610,8 +575,8 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ugt_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -625,13 +590,6 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ugt_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm0, %xmm1 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ugt_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -756,8 +714,8 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ult_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -771,13 +729,6 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ult_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ult_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -980,8 +931,8 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_uno_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -995,13 +946,6 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_uno_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovnpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_uno_q: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -1204,8 +1148,8 @@ define i32 @test_f64_oge_q(i32 
%a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_oge_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -1219,13 +1163,6 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_oge_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_oge_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1350,8 +1287,8 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ole_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -1365,13 +1302,6 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ole_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm0, %xmm1 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ole_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1423,8 +1353,8 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_one_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovel %esi, %eax ; SSE-64-NEXT: retq ; @@ -1438,13 +1368,6 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_one_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_one_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1496,8 +1419,8 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ord_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -1511,13 +1434,6 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ord_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ord_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1569,8 +1485,8 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ueq_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnel %esi, %eax ; SSE-64-NEXT: retq ; @@ -1584,13 +1500,6 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ueq_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovnel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ueq_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1642,8 +1551,8 @@ define i32 
@test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ugt_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -1657,13 +1566,6 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ugt_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm0, %xmm1 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ugt_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -1788,8 +1690,8 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ult_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -1803,13 +1705,6 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ult_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ult_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -2012,8 +1907,8 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_uno_q: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -2027,13 +1922,6 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_uno_q: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vucomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovnpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_uno_q: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -2236,8 +2124,8 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_oge_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -2251,13 +2139,6 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_oge_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_oge_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2382,8 +2263,8 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ole_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -2397,13 +2278,6 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ole_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm0, %xmm1 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ole_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2455,8 +2329,8 @@ define i32 
@test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_one_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovel %esi, %eax ; SSE-64-NEXT: retq ; @@ -2470,13 +2344,6 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_one_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_one_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2528,8 +2395,8 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ord_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -2543,13 +2410,6 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ord_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ord_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2601,8 +2461,8 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ueq_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnel %esi, %eax ; SSE-64-NEXT: retq ; @@ -2616,13 +2476,6 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ueq_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovnel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ueq_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2674,8 +2527,8 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ugt_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -2689,13 +2542,6 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ugt_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm0, %xmm1 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ugt_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -2820,8 +2666,8 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_ult_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -2835,13 +2681,6 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_ult_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_ult_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -3044,8 +2883,8 @@ define i32 
@test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; ; SSE-64-LABEL: test_f32_uno_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comiss %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -3059,13 +2898,6 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f32_uno_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomiss %xmm1, %xmm0 -; AVX-64-NEXT: cmovnpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f32_uno_s: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) @@ -3268,8 +3100,8 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_oge_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -3283,13 +3115,6 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_oge_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_oge_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3414,8 +3239,8 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ole_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovbl %esi, %eax ; SSE-64-NEXT: retq ; @@ -3429,13 +3254,6 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ole_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm0, %xmm1 -; AVX-64-NEXT: cmovbl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ole_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3487,8 +3305,8 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_one_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovel %esi, %eax ; SSE-64-NEXT: retq ; @@ -3502,13 +3320,6 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_one_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_one_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3560,8 +3371,8 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ord_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -3575,13 +3386,6 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ord_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ord_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3633,8 +3437,8 @@ define i32 
@test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ueq_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnel %esi, %eax ; SSE-64-NEXT: retq ; @@ -3648,13 +3452,6 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ueq_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovnel %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ueq_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3706,8 +3503,8 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ugt_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm0, %xmm1 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -3721,13 +3518,6 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ugt_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm0, %xmm1 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ugt_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -3852,8 +3642,8 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_ult_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovael %esi, %eax ; SSE-64-NEXT: retq ; @@ -3867,13 +3657,6 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_ult_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovael %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_ult_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) @@ -4076,8 +3859,8 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; ; SSE-64-LABEL: test_f64_uno_s: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: comisd %xmm1, %xmm0 +; SSE-64-NEXT: movl %edi, %eax ; SSE-64-NEXT: cmovnpl %esi, %eax ; SSE-64-NEXT: retq ; @@ -4091,13 +3874,6 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 { ; AVX-32-NEXT: movl (%ecx), %eax ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_f64_uno_s: -; AVX-64: # %bb.0: -; AVX-64-NEXT: movl %edi, %eax -; AVX-64-NEXT: vcomisd %xmm1, %xmm0 -; AVX-64-NEXT: cmovnpl %esi, %eax -; AVX-64-NEXT: retq -; ; X87-LABEL: test_f64_uno_s: ; X87: # %bb.0: ; X87-NEXT: fldl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll index f1be74f5c3ac4..e584318ebf846 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll @@ -613,12 +613,10 @@ define double @fma_f64(double %a, double %b, double %c) nounwind strictfp { ; SSE-X86-LABEL: fma_f64: ; SSE-X86: # %bb.0: ; SSE-X86-NEXT: subl $24, %esp -; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-X86-NEXT: movsd %xmm2, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: movsd %xmm1, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movsd %xmm0, 
(%esp) +; SSE-X86-NEXT: movups %xmm0, (%esp) ; SSE-X86-NEXT: calll fma ; SSE-X86-NEXT: addl $24, %esp ; SSE-X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll index 1de2484d47ba1..061e0cd057983 100644 --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1206,8 +1206,8 @@ define fp128 @TestPair128(i64 %a, i64 %b) nounwind { ; X64-SSE-LABEL: TestPair128: ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: addq $3, %rsi -; X64-SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: adcq $0, %rdi +; X64-SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-SSE-NEXT: retq @@ -1220,10 +1220,10 @@ define fp128 @TestPair128(i64 %a, i64 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl $3, %ecx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edx, 4(%eax) @@ -1236,8 +1236,8 @@ define fp128 @TestPair128(i64 %a, i64 %b) nounwind { ; X64-AVX-LABEL: TestPair128: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: addq $3, %rsi -; X64-AVX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: adcq $0, %rdi +; X64-AVX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index 3ac4415d075c9..c9dfbbd11ddb0 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -42,25 +42,20 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: add: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -142,25 +137,20 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: sub: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -242,25 +232,20 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: mul: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -342,25 +327,20 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: div: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -435,9 +415,14 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; X86-LABEL: fma: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $88, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: movups %xmm2, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -457,7 
+442,7 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $88, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -539,9 +524,12 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: frem: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -557,7 +545,7 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -632,9 +620,10 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; X86-LABEL: ceil: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -646,7 +635,7 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -714,9 +703,10 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; X86-LABEL: acos: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -728,7 +718,7 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -796,9 +786,10 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; X86-LABEL: cos: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -810,7 +801,7 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -878,9 +869,10 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; X86-LABEL: cosh: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups 
{{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -892,7 +884,7 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -960,9 +952,10 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; X86-LABEL: exp: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -974,7 +967,7 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1042,9 +1035,10 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; X86-LABEL: exp2: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1056,7 +1050,7 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1124,9 +1118,10 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; X86-LABEL: floor: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1138,7 +1133,7 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1206,9 +1201,10 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; X86-LABEL: log: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1220,7 +1216,7 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1288,9 +1284,10 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; X86-LABEL: log10: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1302,7 +1299,7 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1370,9 +1367,10 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; X86-LABEL: log2: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1384,7 +1382,7 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1452,9 +1450,12 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: maxnum: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1470,7 +1471,7 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1545,9 +1546,12 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: minnum: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1563,7 +1567,7 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1638,9 +1642,10 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; X86-LABEL: nearbyint: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1652,7 +1657,7 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl 
%esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1720,9 +1725,12 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: pow: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1738,7 +1746,7 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1820,22 +1828,20 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; X86-LABEL: powi: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __powitf2 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1904,9 +1910,10 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; X86-LABEL: rint: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1918,7 +1925,7 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1986,9 +1993,10 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; X86-LABEL: round: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2000,7 +2008,7 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2068,9 +2076,10 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; X86-LABEL: roundeven: ; X86: # %bb.0: # 
%entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2082,7 +2091,7 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2150,9 +2159,10 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; X86-LABEL: asin: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2164,7 +2174,7 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2232,9 +2242,10 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; X86-LABEL: sin: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2246,7 +2257,7 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2314,9 +2325,10 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; X86-LABEL: sinh: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2328,7 +2340,7 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2396,9 +2408,10 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; X86-LABEL: sqrt: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2410,7 +2423,7 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2478,9 +2491,10 @@ define 
fp128 @atan(fp128 %x) nounwind strictfp { ; X86-LABEL: atan: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2492,7 +2506,7 @@ define fp128 @atan(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2560,9 +2574,12 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; X86-LABEL: atan2: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2578,7 +2595,7 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2653,9 +2670,10 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; X86-LABEL: tan: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2667,7 +2685,7 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2735,9 +2753,10 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; X86-LABEL: tanh: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2749,7 +2768,7 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2817,9 +2836,10 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; X86-LABEL: trunc: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2831,7 +2851,7 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; X86-NEXT: movaps (%esp), %xmm0 ; 
X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -3132,24 +3152,19 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; X86-LABEL: cmp: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: subl $44, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, (%esp) ; X86-NEXT: calll __eqtf2 -; X86-NEXT: addl $32, %esp ; X86-NEXT: testl %eax, %eax ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl (%ecx), %eax ; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $44, %esp ; X86-NEXT: retl ; ; WIN-LABEL: cmp: @@ -3256,24 +3271,19 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; X86-LABEL: cmps: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: subl $44, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, (%esp) ; X86-NEXT: calll __eqtf2 -; X86-NEXT: addl $32, %esp ; X86-NEXT: testl %eax, %eax ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl (%ecx), %eax ; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $44, %esp ; X86-NEXT: retl ; ; WIN-LABEL: cmps: diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index f727a79078627..37430a4dd19fc 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -42,22 +42,18 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Add: @@ -144,22 +140,18 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; 
X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps vf128, %xmm1 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Add: @@ -241,22 +233,18 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sub: @@ -343,22 +331,18 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps vf128, %xmm1 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Sub: @@ -440,22 +424,18 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Mul: @@ -542,22 +522,18 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps vf128, %xmm1 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Mul: @@ -639,22 +615,18 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Div: @@ -741,22 +713,18 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps vf128, %xmm1 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Div: @@ -830,7 +798,11 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Rem: ; X86: # %bb.0: # %entry -; 
X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -845,7 +817,7 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; X86-NEXT: addl $44, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rem: @@ -922,7 +894,11 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $76, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps vf128, %xmm1 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -937,7 +913,7 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; X86-NEXT: addl $44, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Rem: @@ -1011,7 +987,9 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sqrt: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1022,7 +1000,7 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sqrt: @@ -1089,7 +1067,9 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sin: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1100,7 +1080,7 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sin: @@ -1167,7 +1147,9 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Cos: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1178,7 +1160,7 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Cos: @@ -1245,7 +1227,9 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Ceil: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; 
X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1256,7 +1240,7 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Ceil: @@ -1323,7 +1307,9 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Floor: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1334,7 +1320,7 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Floor: @@ -1401,7 +1387,9 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Trunc: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1412,7 +1400,7 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Trunc: @@ -1479,7 +1467,9 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Nearbyint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1490,7 +1480,7 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Nearbyint: @@ -1557,7 +1547,9 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Rint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1568,7 +1560,7 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rint: @@ -1635,7 +1627,9 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Round: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $60, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1646,7 +1640,7 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; 
X86-NEXT: addl $28, %esp ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Round: @@ -1706,9 +1700,14 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $88, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: movups %xmm2, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1728,7 +1727,7 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $88, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1805,9 +1804,10 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; X86-LABEL: Test128Acos: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1819,7 +1819,7 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1880,9 +1880,10 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; X86-LABEL: Test128Asin: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1894,7 +1895,7 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -1955,9 +1956,10 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; X86-LABEL: Test128Atan: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -1969,7 +1971,7 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2030,9 +2032,12 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; X86-LABEL: Test128Atan2: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl 
$12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: movups %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2048,7 +2053,7 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2116,9 +2121,10 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; X86-LABEL: Test128Cosh: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2130,7 +2136,7 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2191,9 +2197,10 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; X86-LABEL: Test128Sinh: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2205,7 +2212,7 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2266,9 +2273,10 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; X86-LABEL: Test128Tan: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2280,7 +2288,7 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; @@ -2341,9 +2349,10 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; X86-LABEL: Test128Tanh: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movups %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) @@ -2355,7 +2364,7 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; X86-NEXT: movaps (%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl $4 ; diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll index 
3f5ec7b530fe0..91239c7266aa7 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat.ll @@ -639,9 +639,9 @@ define i32 @utest_f64i32_mm(double %x) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si %xmm0, %rcx ; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rdx, %rax ; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF @@ -700,9 +700,9 @@ define i32 @utest_f32i32_mm(float %x) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si %xmm0, %rcx ; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rdx, %rax ; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF @@ -769,9 +769,9 @@ define i32 @utesth_f16i32_mm(half %x) nounwind { ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rcx ; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rdx, %rax ; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 1a2cfd69650b8..7a458207d1031 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -9,10 +9,10 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) nounwind { ; SSE-LABEL: stest_f64i32: ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -32,8 +32,8 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) nounwind { ; SSE-NEXT: pxor %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -87,30 +87,30 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) nounwind { ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: subsd %xmm1, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; 
SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE-NEXT: retq ; ; AVX2-LABEL: utest_f64i32: @@ -230,16 +230,16 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) nounwind { ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: cvttss2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: cvttss2si %xmm2, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movq %rcx, %xmm2 ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] @@ -357,30 +357,30 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) nounwind { ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: subss %xmm2, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm4 ; SSE-NEXT: subss %xmm2, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: subss %xmm2, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx @@ -1180,8 +1180,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) nounwind { ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttps2dq %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -1700,8 +1700,8 @@ define <4 x i8> @utest_f32i8(<4 x float> %x) nounwind { ; SSE: # %bb.0: # %entry ; SSE-NEXT: 
cvttps2dq %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: cvttps2dq %xmm0, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 @@ -1797,11 +1797,11 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 @@ -2074,11 +2074,11 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 @@ -2351,11 +2351,11 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 @@ -2633,10 +2633,10 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) nounwind { ; SSE-LABEL: stest_f64i32_mm: ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -2656,8 +2656,8 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) nounwind { ; SSE-NEXT: pxor %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] @@ -2709,30 +2709,30 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) nounwind { ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: subsd %xmm1, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = 
[9223372039002259456,9223372039002259456] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE-NEXT: retq ; ; AVX2-LABEL: utest_f64i32_mm: @@ -2849,16 +2849,16 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) nounwind { ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: cvttss2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: cvttss2si %xmm2, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movq %rcx, %xmm2 ; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] @@ -2974,30 +2974,30 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) nounwind { ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: subss %xmm2, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm4 ; SSE-NEXT: subss %xmm2, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: subss %xmm2, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx @@ -3782,8 +3782,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) nounwind { ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttps2dq %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: subps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -4197,11 +4197,11 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 @@ -4457,11 +4457,11 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 @@ -4717,11 +4717,11 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) nounwind { ; SSE-NEXT: movabsq $-9223372036854775808, %rdi # imm = 0x8000000000000000 ; SSE-NEXT: cmpq %rsi, %rdi ; SSE-NEXT: movq $-1, %r8 -; SSE-NEXT: movq $-1, %r9 -; SSE-NEXT: sbbq %rcx, %r9 +; SSE-NEXT: sbbq %rcx, %r8 +; SSE-NEXT: movq $-1, %rcx ; SSE-NEXT: cmovgeq %rdi, %rsi ; SSE-NEXT: cmpq %rax, %rdi -; SSE-NEXT: sbbq %rdx, %r8 +; SSE-NEXT: sbbq %rdx, %rcx ; SSE-NEXT: cmovgeq %rdi, %rax ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movq %rsi, %xmm1 diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll index c79e19f07cda5..38256b8bbc7df 100644 --- a/llvm/test/CodeGen/X86/fpenv.ll +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -186,14 +186,14 @@ define void @func_05(i32 %x) nounwind { ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-NOSSE-NEXT: leal 4(,%eax,2), %ecx ; X86-NOSSE-NEXT: movl $201, %eax ; X86-NOSSE-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOSSE-NEXT: shll %cl, %eax -; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-NOSSE-NEXT: fnstcw (%esp) ; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF ; X86-NOSSE-NEXT: andl (%esp), %ecx +; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-NOSSE-NEXT: orl %eax, %ecx ; X86-NOSSE-NEXT: movw %cx, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -204,14 +204,14 @@ define void @func_05(i32 %x) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-SSE-NEXT: leal 4(,%eax,2), %ecx ; X86-SSE-NEXT: movl $201, %eax ; X86-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SSE-NEXT: shll %cl, %eax -; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-SSE-NEXT: fnstcw (%esp) ; X86-SSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF ; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-SSE-NEXT: orl %eax, %ecx ; X86-SSE-NEXT: movw %cx, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -227,14 +227,14 @@ define void @func_05(i32 %x) nounwind { ; X64-LABEL: func_05: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rdi), %ecx +; X64-NEXT: leal 4(,%rdi,2), %ecx ; X64-NEXT: movl $201, %eax ; X64-NEXT: # kill: 
def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %eax -; X64-NEXT: andl $3072, %eax # imm = 0xC00 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) ; X64-NEXT: movl $-3073, %ecx # imm = 0xF3FF ; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $3072, %eax # imm = 0xC00 ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll index 85f4c945230e1..410fd1faa2501 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -4013,9 +4013,9 @@ define i50 @test_signed_i50_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: cmovbl %ecx, %esi ; X86-SSE-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 ; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmovbl %ecx, %esi ; X86-SSE-NEXT: fldl {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st @@ -4152,9 +4152,9 @@ define i64 @test_signed_i64_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: cmovbl %ecx, %esi ; X86-SSE-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: cmovbl %ecx, %esi ; X86-SSE-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st @@ -4516,9 +4516,9 @@ define i128 @test_signed_i128_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: cmovbl %ecx, %edi ; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 ; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp +; X86-SSE-NEXT: cmovbl %ecx, %edi ; X86-SSE-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch %st(1) ; X86-SSE-NEXT: fucomi %st(1), %st diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index 7f6d64c21724a..098a7f4798b58 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -25,15 +25,15 @@ define <4 x i1> @test_signed_v4i1_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: minss %xmm3, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %ecx ; CHECK-NEXT: cmovpl %eax, %ecx +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: ucomiss %xmm1, %xmm1 +; CHECK-NEXT: maxss %xmm2, %xmm1 +; CHECK-NEXT: minss %xmm3, %xmm1 +; CHECK-NEXT: cvttss2si %xmm1, %edx ; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: ucomiss %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %ecx -; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm4 +; CHECK-NEXT: cmovpl %eax, %edx +; CHECK-NEXT: movd %edx, %xmm4 ; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: maxss %xmm2, %xmm1 @@ -67,7 +67,6 @@ define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: minss %xmm3, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,1],xmm0[1,1] ; CHECK-NEXT: movaps %xmm1, %xmm4 @@ -75,6 +74,7 @@ define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movaps %xmm2, %xmm3 ; CHECK-NEXT: minss %xmm4, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %ecx +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shll $8, %ecx ; CHECK-NEXT: orl %eax, %ecx @@ -141,35 +141,35 @@ define <4 x i16> @test_signed_v4i16_v4f32(<4 x float> %f) nounwind { define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i32_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %edx -; CHECK-NEXT: movss {{.*#+}} xmm2 = [2.14748352E+9,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: ucomiss %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; CHECK-NEXT: cvttss2si %xmm2, %edx +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.14748352E+9,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: ucomiss %xmm1, %xmm2 ; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF ; CHECK-NEXT: cmoval %eax, %edx ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: ucomiss %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm2, %xmm2 ; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm3, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm3 -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm3 -; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmoval %eax, %edx +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; CHECK-NEXT: cvttss2si %xmm2, %esi +; CHECK-NEXT: ucomiss %xmm1, %xmm2 +; CHECK-NEXT: cmoval %eax, %esi +; CHECK-NEXT: ucomiss %xmm2, %xmm2 +; CHECK-NEXT: cmovpl %ecx, %esi +; CHECK-NEXT: cvttss2si %xmm0, %edi +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmoval %eax, %edi ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: cmovpl %ecx, %edi +; CHECK-NEXT: movd %edx, %xmm2 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: movd %esi, %xmm3 +; CHECK-NEXT: movd %edi, %xmm1 ; CHECK-NEXT: cvttss2si %xmm0, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; CHECK-NEXT: cmoval %eax, %edx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ecx, %edx @@ -193,27 +193,27 @@ define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rcx, %rdx +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; CHECK-NEXT: cvttss2si %xmm2, %rsi +; CHECK-NEXT: ucomiss %xmm1, %xmm2 +; CHECK-NEXT: cmovaq %rax, %rsi +; CHECK-NEXT: ucomiss %xmm2, %xmm2 +; CHECK-NEXT: cmovpq %rcx, %rsi +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; CHECK-NEXT: cvttss2si %xmm2, %rdi +; CHECK-NEXT: ucomiss %xmm1, %xmm2 +; CHECK-NEXT: cmovaq %rax, %rdi +; CHECK-NEXT: ucomiss %xmm2, %xmm2 +; CHECK-NEXT: cmovpq %rcx, %rdi ; CHECK-NEXT: movq %rdx, %xmm2 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,1],xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm3, %rdx -; CHECK-NEXT: ucomiss %xmm1, %xmm3 -; CHECK-NEXT: cmovaq %rax, %rdx -; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm3, %rdx -; CHECK-NEXT: ucomiss %xmm1, %xmm3 -; CHECK-NEXT: cmovaq %rax, %rdx -; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm3 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx ; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm1 +; CHECK-NEXT: movq %rdi, %xmm3 +; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cmovaq %rax, %rdx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rcx, %rdx @@ -240,11 +240,11 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: movq %rdx, %r15 -; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbq %r14, %rax +; CHECK-NEXT: cmovbq %rcx, %rax +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-NEXT: cmovbq %rcx, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -253,9 +253,11 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %r14, %rax +; CHECK-NEXT: movl $0, %ecx +; CHECK-NEXT: cmovpq %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: cmovpq %r14, %r15 +; CHECK-NEXT: cmovpq %rcx, %r15 +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -347,25 +349,24 @@ declare <2 x i128> @llvm.fptosi.sat.v2i128.v2f64(<2 x double>) define <2 x i1> @test_signed_v2i1_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_signed_v2i1_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm2 = [-1.0E+0,0.0E+0] -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: maxsd %xmm2, %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-1.0E+0,0.0E+0] +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: maxsd %xmm1, %xmm2 ; CHECK-NEXT: xorpd %xmm3, %xmm3 -; CHECK-NEXT: minsd %xmm3, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax +; CHECK-NEXT: minsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: ucomisd %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rcx, %rax -; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: ucomisd %xmm0, %xmm0 -; CHECK-NEXT: maxsd %xmm2, %xmm0 +; CHECK-NEXT: maxsd %xmm1, %xmm0 ; CHECK-NEXT: minsd %xmm3, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %rax -; CHECK-NEXT: cmovpq %rcx, %rax +; CHECK-NEXT: cvttsd2si %xmm0, %rdx +; CHECK-NEXT: cmovpq %rcx, %rdx ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movq %rdx, 
%xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %x = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> %f) ret <2 x i1> %x @@ -397,17 +398,18 @@ define <2 x i8> @test_signed_v2i8_v2f64(<2 x double> %f) nounwind { define <2 x i16> @test_signed_v2i16_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_signed_v2i16_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0] -; CHECK-NEXT: movapd %xmm1, %xmm2 -; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [-3.2768E+4,0.0E+0] +; CHECK-NEXT: movapd %xmm2, %xmm3 +; CHECK-NEXT: maxsd %xmm1, %xmm3 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.2767E+4,0.0E+0] +; CHECK-NEXT: movapd %xmm1, %xmm4 +; CHECK-NEXT: minsd %xmm3, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %eax ; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.2767E+4,0.0E+0] -; CHECK-NEXT: movapd %xmm0, %xmm3 -; CHECK-NEXT: minsd %xmm2, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %eax -; CHECK-NEXT: minsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %ecx +; CHECK-NEXT: minsd %xmm2, %xmm1 +; CHECK-NEXT: cvttsd2si %xmm1, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: pinsrw $1, %eax, %xmm0 ; CHECK-NEXT: retq @@ -418,25 +420,24 @@ define <2 x i16> @test_signed_v2i16_v2f64(<2 x double> %f) nounwind { define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_signed_v2i32_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm2 = [-2.147483648E+9,0.0E+0] -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: maxsd %xmm2, %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-2.147483648E+9,0.0E+0] +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: maxsd %xmm1, %xmm2 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = [2.147483647E+9,0.0E+0] -; CHECK-NEXT: minsd %xmm3, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %eax +; CHECK-NEXT: minsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: ucomisd %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ecx, %eax -; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: ucomisd %xmm0, %xmm0 -; CHECK-NEXT: maxsd %xmm2, %xmm0 +; CHECK-NEXT: maxsd %xmm1, %xmm0 ; CHECK-NEXT: minsd %xmm3, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %eax -; CHECK-NEXT: cmovpl %ecx, %eax +; CHECK-NEXT: cvttsd2si %xmm0, %edx +; CHECK-NEXT: cmovpl %ecx, %edx ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq %x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %f) ret <2 x i32> %x @@ -446,23 +447,22 @@ define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_signed_v2i64_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: cvttsd2si %xmm0, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547748E+18,0.0E+0] -; CHECK-NEXT: ucomisd %xmm2, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547748E+18,0.0E+0] +; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: ucomisd %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %rdx, %rax -; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: cvttsd2si 
%xmm0, %rax -; CHECK-NEXT: ucomisd %xmm2, %xmm0 -; CHECK-NEXT: cmovaq %rcx, %rax +; CHECK-NEXT: cvttsd2si %xmm0, %rsi +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: cmovaq %rcx, %rsi ; CHECK-NEXT: ucomisd %xmm0, %xmm0 -; CHECK-NEXT: cmovpq %rdx, %rax +; CHECK-NEXT: cmovpq %rdx, %rsi ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movq %rsi, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f) ret <2 x i64> %x @@ -484,11 +484,11 @@ define <2 x i128> @test_signed_v2i128_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: movq %rax, %r14 -; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: xorl %r12d, %r12d ; CHECK-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %r14 +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 ; CHECK-NEXT: cmovbq %rax, %r15 ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll index 47dc3ca3616ea..9c9e92571d5d1 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -1309,9 +1309,9 @@ define i32 @test_unsigned_i32_f64(double %f) nounwind { ; X86-SSE-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: cvttsd2si %xmm0, %ecx ; X86-SSE-NEXT: movl %ecx, %edx -; X86-SSE-NEXT: sarl $31, %edx ; X86-SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE-NEXT: sarl $31, %edx ; X86-SSE-NEXT: andl %edx, %eax ; X86-SSE-NEXT: orl %ecx, %eax ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll index ffbdd66529f5c..81e946a514aaa 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -58,7 +58,6 @@ define <4 x i8> @test_unsigned_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: minss %xmm3, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; CHECK-NEXT: xorps %xmm4, %xmm4 @@ -66,6 +65,7 @@ define <4 x i8> @test_unsigned_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movaps %xmm2, %xmm3 ; CHECK-NEXT: minss %xmm4, %xmm3 ; CHECK-NEXT: cvttss2si %xmm3, %ecx +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shll $8, %ecx ; CHECK-NEXT: orl %eax, %ecx @@ -97,31 +97,31 @@ define <4 x i16> @test_unsigned_v4i16_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: maxss %xmm1, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm4 = [6.5535E+4,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: movaps %xmm4, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 +; CHECK-NEXT: maxss %xmm1, %xmm2 +; CHECK-NEXT: movss {{.*#+}} xmm3 = [6.5535E+4,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movaps %xmm3, %xmm1 +; CHECK-NEXT: minss %xmm2, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %eax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: maxss %xmm0, %xmm1 -; 
CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm1, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm1, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %ecx ; CHECK-NEXT: movd %ecx, %xmm1 ; CHECK-NEXT: pinsrw $1, %eax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: xorps %xmm5, %xmm5 -; CHECK-NEXT: maxss %xmm3, %xmm5 -; CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm5, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; CHECK-NEXT: xorps %xmm4, %xmm4 +; CHECK-NEXT: maxss %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: minss %xmm4, %xmm2 +; CHECK-NEXT: cvttss2si %xmm2, %eax +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: pinsrw $2, %eax, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: maxss %xmm0, %xmm2 -; CHECK-NEXT: minss %xmm2, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax +; CHECK-NEXT: minss %xmm2, %xmm3 +; CHECK-NEXT: cvttss2si %xmm3, %eax ; CHECK-NEXT: pinsrw $3, %eax, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -132,43 +132,42 @@ define <4 x i16> @test_unsigned_v4i16_v4f32(<4 x float> %f) nounwind { define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i32_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %rdx +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; CHECK-NEXT: cvttss2si %xmm3, %rdx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: ucomiss %xmm2, %xmm1 +; CHECK-NEXT: ucomiss %xmm2, %xmm3 ; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: movss {{.*#+}} xmm3 = [4.29496704E+9,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: ucomiss %xmm3, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.29496704E+9,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: ucomiss %xmm1, %xmm3 ; CHECK-NEXT: movl $-1, %ecx ; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm4, %rdx -; CHECK-NEXT: ucomiss %xmm2, %xmm4 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm4 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; CHECK-NEXT: cvttss2si %xmm3, %rsi +; CHECK-NEXT: ucomiss %xmm2, %xmm3 +; CHECK-NEXT: cmovbl %eax, %esi +; CHECK-NEXT: ucomiss %xmm1, %xmm3 +; CHECK-NEXT: cmoval %ecx, %esi +; CHECK-NEXT: cvttss2si %xmm0, %rdi ; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: cmovbl %eax, %edi +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: cmoval %ecx, %edi +; CHECK-NEXT: movd %edx, %xmm3 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: cvttss2si %xmm0, %rdx ; CHECK-NEXT: ucomiss %xmm2, %xmm0 ; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm0 +; CHECK-NEXT: movd %esi, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: movd %edi, %xmm0 ; CHECK-NEXT: 
cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f) ret <4 x i32> %x @@ -194,53 +193,53 @@ define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: ucomiss %xmm4, %xmm0 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm2 -; CHECK-NEXT: movaps %xmm0, %xmm5 -; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; CHECK-NEXT: movaps %xmm5, %xmm6 -; CHECK-NEXT: subss %xmm1, %xmm6 -; CHECK-NEXT: cvttss2si %xmm6, %rdx -; CHECK-NEXT: cvttss2si %xmm5, %rsi -; CHECK-NEXT: movq %rsi, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: andq %rdx, %rdi -; CHECK-NEXT: orq %rsi, %rdi -; CHECK-NEXT: ucomiss %xmm3, %xmm5 -; CHECK-NEXT: cmovbq %rax, %rdi -; CHECK-NEXT: ucomiss %xmm4, %xmm5 -; CHECK-NEXT: cmovaq %rcx, %rdi -; CHECK-NEXT: movq %rdi, %xmm5 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; CHECK-NEXT: movaps %xmm0, %xmm5 -; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3] -; CHECK-NEXT: movaps %xmm5, %xmm6 -; CHECK-NEXT: subss %xmm1, %xmm6 -; CHECK-NEXT: cvttss2si %xmm6, %rdx -; CHECK-NEXT: cvttss2si %xmm5, %rsi -; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: subss %xmm1, %xmm5 +; CHECK-NEXT: cvttss2si %xmm5, %rdi +; CHECK-NEXT: cvttss2si %xmm2, %r8 +; CHECK-NEXT: movq %r8, %rsi +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: andq %rdi, %rsi +; CHECK-NEXT: orq %r8, %rsi +; CHECK-NEXT: ucomiss %xmm3, %xmm2 +; CHECK-NEXT: cmovbq %rax, %rsi +; CHECK-NEXT: ucomiss %xmm4, %xmm2 +; CHECK-NEXT: cmovaq %rcx, %rsi +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: subss %xmm1, %xmm5 +; CHECK-NEXT: cvttss2si %xmm5, %r8 +; CHECK-NEXT: cvttss2si %xmm2, %r9 +; CHECK-NEXT: movq %r9, %rdi ; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: andq %rdx, %rdi -; CHECK-NEXT: orq %rsi, %rdi -; CHECK-NEXT: ucomiss %xmm3, %xmm5 +; CHECK-NEXT: andq %r8, %rdi +; CHECK-NEXT: orq %r9, %rdi +; CHECK-NEXT: ucomiss %xmm3, %xmm2 ; CHECK-NEXT: cmovbq %rax, %rdi -; CHECK-NEXT: ucomiss %xmm4, %xmm5 +; CHECK-NEXT: ucomiss %xmm4, %xmm2 ; CHECK-NEXT: cmovaq %rcx, %rdi -; CHECK-NEXT: movq %rdi, %xmm5 +; CHECK-NEXT: movq %rdx, %xmm2 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps %xmm0, %xmm6 -; CHECK-NEXT: subss %xmm1, %xmm6 -; CHECK-NEXT: cvttss2si %xmm6, %rdx -; CHECK-NEXT: cvttss2si %xmm0, %rsi -; CHECK-NEXT: movq %rsi, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: andq %rdx, %rdi -; CHECK-NEXT: orq %rsi, %rdi +; CHECK-NEXT: movaps %xmm0, %xmm5 +; CHECK-NEXT: subss %xmm1, %xmm5 +; CHECK-NEXT: cvttss2si %xmm5, %rdx +; CHECK-NEXT: cvttss2si %xmm0, %r8 +; CHECK-NEXT: movq %rsi, %xmm1 +; CHECK-NEXT: movq %r8, %rsi +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: andq %rdx, %rsi +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: orq %r8, %rsi ; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmovbq %rax, %rdi +; CHECK-NEXT: cmovbq %rax, %rsi ; CHECK-NEXT: ucomiss %xmm4, %xmm0 -; CHECK-NEXT: cmovaq 
%rcx, %rdi -; CHECK-NEXT: movq %rdi, %xmm1 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; CHECK-NEXT: movq %rdi, %xmm0 +; CHECK-NEXT: cmovaq %rcx, %rsi +; CHECK-NEXT: movq %rsi, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) @@ -352,20 +351,19 @@ declare <2 x i128> @llvm.fptoui.sat.v2i128.v2f64(<2 x double>) define <2 x i1> @test_unsigned_v2i1_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i1_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorpd %xmm2, %xmm2 -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: maxsd %xmm2, %xmm1 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: maxsd %xmm1, %xmm2 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] -; CHECK-NEXT: minsd %xmm3, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax -; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: minsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rax ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxsd %xmm2, %xmm0 +; CHECK-NEXT: maxsd %xmm1, %xmm0 ; CHECK-NEXT: minsd %xmm3, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: cvttsd2si %xmm0, %rcx ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movq %rcx, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %x = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> %f) ret <2 x i1> %x @@ -397,17 +395,18 @@ define <2 x i8> @test_unsigned_v2i8_v2f64(<2 x double> %f) nounwind { define <2 x i16> @test_unsigned_v2i16_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i16_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: maxsd %xmm1, %xmm3 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [6.5535E+4,0.0E+0] +; CHECK-NEXT: movapd %xmm1, %xmm4 +; CHECK-NEXT: minsd %xmm3, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %eax ; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = [6.5535E+4,0.0E+0] -; CHECK-NEXT: movapd %xmm0, %xmm3 -; CHECK-NEXT: minsd %xmm2, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %eax -; CHECK-NEXT: minsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %ecx +; CHECK-NEXT: minsd %xmm2, %xmm1 +; CHECK-NEXT: cvttsd2si %xmm1, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: pinsrw $1, %eax, %xmm0 ; CHECK-NEXT: retq @@ -418,21 +417,20 @@ define <2 x i16> @test_unsigned_v2i16_v2f64(<2 x double> %f) nounwind { define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i32_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: maxsd %xmm0, %xmm2 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = [4.294967295E+9,0.0E+0] ; CHECK-NEXT: movapd %xmm3, %xmm4 -; CHECK-NEXT: minsd %xmm1, %xmm4 +; CHECK-NEXT: minsd %xmm2, %xmm4 ; CHECK-NEXT: cvttsd2si %xmm4, %rax -; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: minsd %xmm2, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %rax +; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: minsd %xmm1, %xmm3 +; CHECK-NEXT: cvttsd2si 
%xmm3, %rcx ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq %x = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %f) ret <2 x i32> %x @@ -441,40 +439,39 @@ define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %f) nounwind { define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i64_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: subsd %xmm2, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax -; CHECK-NEXT: cvttsd2si %xmm0, %rcx -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rax, %rdx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: xorpd %xmm3, %xmm3 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: subsd %xmm1, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %rcx +; CHECK-NEXT: cvttsd2si %xmm0, %rdx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: andq %rcx, %rax +; CHECK-NEXT: orq %rdx, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: ucomisd %xmm2, %xmm0 +; CHECK-NEXT: cmovbq %rcx, %rax +; CHECK-NEXT: movsd {{.*#+}} xmm3 = [1.844674407370955E+19,0.0E+0] ; CHECK-NEXT: ucomisd %xmm3, %xmm0 -; CHECK-NEXT: cmovbq %rax, %rdx -; CHECK-NEXT: movsd {{.*#+}} xmm4 = [1.844674407370955E+19,0.0E+0] -; CHECK-NEXT: ucomisd %xmm4, %xmm0 -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: cmovaq %rcx, %rdx -; CHECK-NEXT: movq %rdx, %xmm1 +; CHECK-NEXT: movq $-1, %rdx +; CHECK-NEXT: cmovaq %rdx, %rax ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movapd %xmm0, %xmm5 -; CHECK-NEXT: subsd %xmm2, %xmm5 -; CHECK-NEXT: cvttsd2si %xmm5, %rdx -; CHECK-NEXT: cvttsd2si %xmm0, %rsi -; CHECK-NEXT: movq %rsi, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: andq %rdx, %rdi -; CHECK-NEXT: orq %rsi, %rdi +; CHECK-NEXT: movapd %xmm0, %xmm4 +; CHECK-NEXT: subsd %xmm1, %xmm4 +; CHECK-NEXT: cvttsd2si %xmm4, %rsi +; CHECK-NEXT: cvttsd2si %xmm0, %rdi +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: sarq $63, %r8 +; CHECK-NEXT: andq %rsi, %r8 +; CHECK-NEXT: orq %rdi, %r8 +; CHECK-NEXT: ucomisd %xmm2, %xmm0 +; CHECK-NEXT: cmovbq %rcx, %r8 ; CHECK-NEXT: ucomisd %xmm3, %xmm0 -; CHECK-NEXT: cmovbq %rax, %rdi -; CHECK-NEXT: ucomisd %xmm4, %xmm0 -; CHECK-NEXT: cmovaq %rcx, %rdi -; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: cmovaq %rdx, %r8 +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %r8, %xmm1 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f) ret <2 x i64> %x diff --git a/llvm/test/CodeGen/X86/frame-base.ll b/llvm/test/CodeGen/X86/frame-base.ll index 72c062b77a5ee..906b8981cff9f 100644 --- a/llvm/test/CodeGen/X86/frame-base.ll +++ b/llvm/test/CodeGen/X86/frame-base.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s ; The issue here was a conflict between forming a %rip-relative lea and a @@ -8,8 +9,15 @@ define void @test_frame_rip_conflict() { ; CHECK-LABEL: 
test_frame_rip_conflict: -; CHECK: leaq _var(%rip), [[TMPADDR:%r.*]] -; CHECK: leaq {{-?[0-9]+}}(%rsp,[[TMPADDR]]), +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: leaq _var(%rip), %rax +; CHECK-NEXT: leaq (%rsp,%rax), %rdi +; CHECK-NEXT: addq $4, %rdi +; CHECK-NEXT: callq _eat_i64 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq %stackvar = alloca i32 %stackint = ptrtoint ptr %stackvar to i64 diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index 506a08808ff91..5884af5d9223d 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -805,17 +805,17 @@ define i32 @freeze_fshr(i32 %a0, i32 %a1, i32 %a2) nounwind { define void @pr59676_frozen(ptr %dst, i32 %x.orig) { ; X86-LABEL: pr59676_frozen: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %eax, %eax ; X86-NEXT: imull $84, %eax, %eax -; X86-NEXT: movl $818089009, %edx # imm = 0x30C30C31 -; X86-NEXT: imull %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $31, %eax +; X86-NEXT: movl $818089009, %ecx # imm = 0x30C30C31 +; X86-NEXT: imull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shrl $31, %ecx ; X86-NEXT: sarl $3, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pr59676_frozen: @@ -843,17 +843,17 @@ define void @pr59676_frozen(ptr %dst, i32 %x.orig) { define void @pr59676_nsw_frozen(ptr %dst, i32 %x.orig) { ; X86-LABEL: pr59676_nsw_frozen: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %eax, %eax ; X86-NEXT: imull $84, %eax, %eax -; X86-NEXT: movl $818089009, %edx # imm = 0x30C30C31 -; X86-NEXT: imull %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $31, %eax +; X86-NEXT: movl $818089009, %ecx # imm = 0x30C30C31 +; X86-NEXT: imull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shrl $31, %ecx ; X86-NEXT: sarl $3, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pr59676_nsw_frozen: @@ -882,16 +882,16 @@ define void @pr59676_nsw(ptr %dst, i32 %x) { ; X86-LABEL: pr59676_nsw: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %eax, %eax ; X86-NEXT: imull $84, %eax, %eax -; X86-NEXT: movl $818089009, %edx # imm = 0x30C30C31 -; X86-NEXT: imull %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $31, %eax +; X86-NEXT: movl $818089009, %ecx # imm = 0x30C30C31 +; X86-NEXT: imull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: shrl $31, %ecx ; X86-NEXT: sarl $3, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: pr59676_nsw: diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 362b3b945f962..5b339a1059c1d 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -170,9 +170,9 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; 
X86-NEXT: vmovdqa (%edx), %xmm0 -; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%ecx), %xmm0 +; X86-NEXT: vpand (%eax), %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -228,26 +228,24 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $16, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: andl $15, %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: andl $15, %ecx -; X86-NEXT: movl 32(%ebp), %edx -; X86-NEXT: movl 12(%ebp), %esi -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: vmovaps (%edi), %xmm0 -; X86-NEXT: vandps (%esi), %xmm0, %xmm0 +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovaps (%esi), %xmm0 +; X86-NEXT: vandps (%edx), %xmm0, %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esp) ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: cmpb (%esp,%eax), %cl -; X86-NEXT: sete (%edx) -; X86-NEXT: leal -8(%ebp), %esp +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sete (%eax) +; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; @@ -277,16 +275,16 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds ; X86-LABEL: freeze_buildvector_single_maybe_poison_operand: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] -; X86-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_buildvector_single_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] +; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [u,42,42,42] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -320,7 +318,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin ; ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] +; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [u,42,u,u] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] @@ -481,21 +479,19 @@ define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, p define void @freeze_buildvector(ptr %origin0, ptr %origin1, ptr %origin2, ptr %origin3, ptr %dst) nounwind { ; X86-LABEL: freeze_buildvector: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0 -; X86-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, (%ecx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $1, (%edx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, (%eax), %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, 
%xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: freeze_buildvector: @@ -529,18 +525,16 @@ define void @freeze_buildvector(ptr %origin0, ptr %origin1, ptr %origin2, ptr %o define void @freeze_buildvector_one_undef_elt(ptr %origin0, ptr %origin1, ptr %origin2, ptr %origin3, ptr %dst) nounwind { ; X86-LABEL: freeze_buildvector_one_undef_elt: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpinsrd $1, (%edx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, (%ecx), %xmm0, %xmm0 +; X86-NEXT: vpinsrd $3, (%eax), %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: freeze_buildvector_one_undef_elt: diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index e8c8ccfa8d37f..0c86c3aefacbc 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -103,10 +103,10 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; ; X86-SLOW-LABEL: var_shift_i32: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: notb %cl ; X86-SLOW-NEXT: shrl %eax ; X86-SLOW-NEXT: shrl %cl, %eax @@ -124,8 +124,8 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: movl %esi, %eax ; X64-SLOW-NEXT: shll %cl, %edi +; X64-SLOW-NEXT: movl %esi, %eax ; X64-SLOW-NEXT: shrl %eax ; X64-SLOW-NEXT: notb %cl ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx @@ -226,11 +226,11 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-SLOW-NEXT: shrl %eax ; X86-SLOW-NEXT: notb %cl ; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: shrl %esi ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: orl %esi, %edx ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi @@ -248,8 +248,8 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X64-SLOW-LABEL: var_shift_i64: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movq %rdx, %rcx -; X64-SLOW-NEXT: movq %rsi, %rax ; X64-SLOW-NEXT: shlq %cl, %rdi +; X64-SLOW-NEXT: movq %rsi, %rax ; X64-SLOW-NEXT: shrq %rax ; X64-SLOW-NEXT: notb %cl ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx @@ -267,25 +267,25 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %ebx, %ebp -; X86-FAST-NEXT: movl %esi, %ebx -; 
X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl %ebx, %ebp +; X86-FAST-NEXT: movl %edx, %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl %edx, %esi +; X86-FAST-NEXT: movl %edi, %edx ; X86-FAST-NEXT: movl %ebx, %edi ; X86-FAST-NEXT: movl %eax, %ebx ; X86-FAST-NEXT: jmp .LBB6_6 @@ -301,12 +301,12 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: shldl %cl, %ebp, %eax ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; X86-FAST-NEXT: movl %esi, %ebx +; X86-FAST-NEXT: movl %edx, %ebx ; X86-FAST-NEXT: shldl %cl, %edi, %ebx ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shldl %cl, %esi, %edx +; X86-FAST-NEXT: shldl %cl, %edx, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %edx, 12(%ecx) +; X86-FAST-NEXT: movl %esi, 12(%ecx) ; X86-FAST-NEXT: movl %ebx, 8(%ecx) ; X86-FAST-NEXT: movl %ebp, 4(%ecx) ; X86-FAST-NEXT: movl %eax, (%ecx) @@ -323,28 +323,28 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: subl $12, %esp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: testb $64, %al ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %ebx, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: movl %esi, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: je .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl %edx, %edi -; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %ebx, %edx +; X86-SLOW-NEXT: movl %ecx, %ebx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp @@ -353,44 +353,52 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: jne .LBB6_4 ; X86-SLOW-NEXT: .LBB6_5: ; X86-SLOW-NEXT: movl %ecx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: movl %edx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebp -; X86-SLOW-NEXT: movb %al, %ch -; X86-SLOW-NEXT: notb %ch -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ebp, %edi +; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: 
movl %ecx, %ebx +; X86-SLOW-NEXT: notb %bl +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: shrl %ebp +; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: orl %eax, %ebx +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: orl %eax, %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) -; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %ebp, (%eax) -; X86-SLOW-NEXT: addl $4, %esp +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %edx, 8(%eax) +; X86-SLOW-NEXT: movl %ebp, 4(%eax) +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, (%eax) +; X86-SLOW-NEXT: addl $12, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx @@ -424,12 +432,12 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X64-SLOW-NEXT: notb %r9b ; X64-SLOW-NEXT: movl %r9d, %ecx ; X64-SLOW-NEXT: shrq %cl, %rdx -; X64-SLOW-NEXT: orq %rdx, %rax ; X64-SLOW-NEXT: movl %r8d, %ecx ; X64-SLOW-NEXT: shlq %cl, %rsi ; X64-SLOW-NEXT: shrq %rdi ; X64-SLOW-NEXT: movl %r9d, %ecx ; X64-SLOW-NEXT: shrq %cl, %rdi +; X64-SLOW-NEXT: orq %rdx, %rax ; X64-SLOW-NEXT: orq %rsi, %rdi ; X64-SLOW-NEXT: movq %rdi, %rdx ; X64-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 4340f8fd484ae..be82747699c2e 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -100,10 +100,10 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; ; X86-SLOW-LABEL: var_shift_i32: ; X86-SLOW: # %bb.0: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: notb %cl ; X86-SLOW-NEXT: addl %eax, %eax ; X86-SLOW-NEXT: shll %cl, %eax @@ -121,8 +121,8 @@ 
define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind { ; X64-SLOW-LABEL: var_shift_i32: ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: movl %edx, %ecx -; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SLOW-NEXT: shrl %cl, %esi +; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax ; X64-SLOW-NEXT: notb %cl ; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx @@ -221,11 +221,11 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: movb %bl, %cl ; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: addl %edx, %edx ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: orl %esi, %edx ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi @@ -263,20 +263,20 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %esi ; X86-FAST-NEXT: pushl %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl %esi, %ebp -; X86-FAST-NEXT: movl %ebx, %esi +; X86-FAST-NEXT: movl %edi, %ebp +; X86-FAST-NEXT: movl %ebx, %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 ; X86-FAST-NEXT: jmp .LBB6_5 @@ -287,19 +287,19 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edi, %ebx -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl %edx, %esi +; X86-FAST-NEXT: movl %esi, %ebx +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl %edx, %edi ; X86-FAST-NEXT: movl %ebp, %edx ; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: ; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: shrdl %cl, %esi, %edx -; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: shrdl %cl, %edi, %edx +; X86-FAST-NEXT: shrdl %cl, %esi, %edi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %edi -; X86-FAST-NEXT: movl %edi, 12(%eax) -; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: shrdl %cl, %ebx, %esi +; X86-FAST-NEXT: movl %esi, 12(%eax) +; X86-FAST-NEXT: movl %edi, 8(%eax) ; X86-FAST-NEXT: movl %edx, 4(%eax) ; X86-FAST-NEXT: movl %ebp, (%eax) ; X86-FAST-NEXT: addl $4, %esp @@ -315,75 +315,80 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $8, %esp +; X86-SLOW-NEXT: subl $12, %esp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; 
X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: testb $64, %cl ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: movl %edx, %eax +; X86-SLOW-NEXT: movl %ebx, %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %edi, %edx -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %edi, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %ecx, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %ecx +; X86-SLOW-NEXT: movl %eax, %ebp +; X86-SLOW-NEXT: testb $32, %cl +; X86-SLOW-NEXT: je .LBB6_4 +; X86-SLOW-NEXT: jmp .LBB6_5 +; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, %edx ; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 -; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: testb $32, %cl -; X86-SLOW-NEXT: je .LBB6_4 +; X86-SLOW-NEXT: movl %esi, %ebp ; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: orl %edx, %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl %ecx, %eax ; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%edi,%edi), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movb %al, %ch +; X86-SLOW-NEXT: notb %ch +; X86-SLOW-NEXT: leal (%edi,%edi), %esi +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %edi +; X86-SLOW-NEXT: movl %edx, %ebp +; X86-SLOW-NEXT: addl %edx, %edx +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: orl %edi, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: leal (%edi,%edi), %esi +; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shll %cl, %esi -; 
X86-SLOW-NEXT: orl %edi, %esi +; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: addl %ebx, %ebx +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-SLOW-NEXT: orl %ebp, %esi +; X86-SLOW-NEXT: orl %edi, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebp, 8(%ecx) +; X86-SLOW-NEXT: movl %ebx, 12(%ecx) +; X86-SLOW-NEXT: movl %esi, 8(%ecx) ; X86-SLOW-NEXT: movl %edx, 4(%ecx) +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, (%ecx) ; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: addl $12, %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx @@ -392,10 +397,10 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; X64-FAST-LABEL: var_shift_i128: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: movq %rdx, %rax ; X64-FAST-NEXT: testb $64, %r8b ; X64-FAST-NEXT: cmoveq %rdi, %rsi ; X64-FAST-NEXT: cmoveq %rcx, %rdi +; X64-FAST-NEXT: movq %rdx, %rax ; X64-FAST-NEXT: cmovneq %rcx, %rax ; X64-FAST-NEXT: movl %r8d, %ecx ; X64-FAST-NEXT: shrdq %cl, %rdi, %rax @@ -416,13 +421,14 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X64-SLOW-NEXT: notb %r9b ; X64-SLOW-NEXT: movl %r9d, %ecx ; X64-SLOW-NEXT: shlq %cl, %rax -; X64-SLOW-NEXT: orq %rdx, %rax ; X64-SLOW-NEXT: movl %r8d, %ecx ; X64-SLOW-NEXT: shrq %cl, %rdi -; X64-SLOW-NEXT: leaq (%rsi,%rsi), %rdx +; X64-SLOW-NEXT: addq %rsi, %rsi ; X64-SLOW-NEXT: movl %r9d, %ecx -; X64-SLOW-NEXT: shlq %cl, %rdx -; X64-SLOW-NEXT: orq %rdi, %rdx +; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: orq %rdx, %rax +; X64-SLOW-NEXT: orq %rdi, %rsi +; X64-SLOW-NEXT: movq %rsi, %rdx ; X64-SLOW-NEXT: retq %tmp = tail call i128 @llvm.fshr.i128(i128 %x, i128 %y, i128 %z) ret i128 %tmp diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll index da44b5ec1371e..c7c35a88546d3 100644 --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -45,9 +45,9 @@ define double @trunc_unsigned_f64(double %x) #0 { ; SSE2: # %bb.0: ; SSE2-NEXT: cvttsd2si %xmm0, %rax ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: cvttsd2si %xmm0, %rdx +; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: andq %rcx, %rdx ; SSE2-NEXT: orq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm1 @@ -91,8 +91,8 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 { ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -169,63 +169,63 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 { define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0] -; SSE2-NEXT: subsd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: movsd {{.*#+}} xmm4 = [9.2233720368547758E+18,0.0E+0] +; SSE2-NEXT: subsd %xmm4, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rax -; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: cvttsd2si %xmm3, %rcx +; SSE2-NEXT: movapd %xmm0, %xmm2 ; SSE2-NEXT: 
movq %rcx, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rax, %rdx ; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE2-NEXT: cvttsd2si %xmm3, %rax ; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: cvttsd2si %xmm2, %rax -; SSE2-NEXT: subsd %xmm3, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: subsd %xmm4, %xmm3 +; SSE2-NEXT: cvttsd2si %xmm3, %rcx ; SSE2-NEXT: movq %rax, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rcx, %rdx ; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: subsd %xmm3, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rax -; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: movq %rdx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: subsd %xmm4, %xmm0 +; SSE2-NEXT: cvttsd2si %xmm0, %rax +; SSE2-NEXT: cvttsd2si %xmm2, %rcx ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rax, %rdx ; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: subsd %xmm3, %xmm0 -; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: movq %rdx, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: cvttsd2si %xmm2, %rax +; SSE2-NEXT: subsd %xmm4, %xmm2 +; SSE2-NEXT: cvttsd2si %xmm2, %rcx ; SSE2-NEXT: movq %rax, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rcx, %rdx ; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: movq %rdx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE2-NEXT: subpd %xmm6, %xmm2 -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: psrlq $32, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: subpd %xmm6, %xmm1 -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: addpd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v4f64: @@ -638,13 +638,12 @@ define <4 x double> @trunc_signed_v4f64_nsz(<4 x double> %x) #0 { ; SSE2-NEXT: cvttsd2si %xmm0, %rsi ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sd %rdx, %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rsi, %xmm1 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: cvtsi2sd %rsi, %xmm2 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: cvtsi2sd %rcx, %xmm2 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: cvtsi2sd %rcx, %xmm3 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: unpcklpd 
{{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_signed_v4f64_nsz: @@ -764,10 +763,10 @@ define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 { ; X86-AVX1-NEXT: subl $32, %esp ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX1-NEXT: fldl (%esp) -; X86-AVX1-NEXT: fisttpll (%esp) ; X86-AVX1-NEXT: xorl %eax, %eax ; X86-AVX1-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-AVX1-NEXT: fldl (%esp) +; X86-AVX1-NEXT: fisttpll (%esp) ; X86-AVX1-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X86-AVX1-NEXT: movl $0, %edx ; X86-AVX1-NEXT: jb .LBB19_2 diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index a464d78f9af38..97c24eea24675 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -77,40 +77,39 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: testb $64, %cl -; X86-SSE2-NEXT: movl %esi, %eax -; X86-SSE2-NEXT: cmovnel %ebx, %eax -; X86-SSE2-NEXT: movl %edx, %ebp +; X86-SSE2-NEXT: movl %eax, %ebp ; X86-SSE2-NEXT: cmovnel %edi, %ebp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl %edx, %ebx +; X86-SSE2-NEXT: cmovnel %esi, %ebx +; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %esi ; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx ; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: testb $32, %cl -; X86-SSE2-NEXT: cmovnel %esi, %edx -; X86-SSE2-NEXT: cmovnel %ebp, %esi -; X86-SSE2-NEXT: cmovnel %eax, %ebp -; X86-SSE2-NEXT: cmovel %edi, %ebx -; X86-SSE2-NEXT: cmovel %eax, %edi -; X86-SSE2-NEXT: movl %edi, %eax -; X86-SSE2-NEXT: shldl %cl, %ebx, %eax -; X86-SSE2-NEXT: movl %ebp, %ebx -; X86-SSE2-NEXT: shldl %cl, %edi, %ebx -; X86-SSE2-NEXT: movl %esi, %edi -; X86-SSE2-NEXT: shldl %cl, %ebp, %edi +; X86-SSE2-NEXT: cmovnel %eax, %edx +; X86-SSE2-NEXT: cmovnel %ebx, %eax +; X86-SSE2-NEXT: cmovnel %ebp, %ebx +; X86-SSE2-NEXT: cmovel %esi, %edi +; X86-SSE2-NEXT: cmovel %ebp, %esi +; X86-SSE2-NEXT: movl %esi, %ebp +; X86-SSE2-NEXT: shldl %cl, %edi, %ebp +; X86-SSE2-NEXT: movl %ebx, %edi +; X86-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-SSE2-NEXT: movl %eax, %esi +; X86-SSE2-NEXT: shldl %cl, %ebx, %esi ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shldl %cl, %esi, %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl %edx, 12(%ecx) -; X86-SSE2-NEXT: movl %edi, 8(%ecx) -; X86-SSE2-NEXT: movl %ebx, 4(%ecx) -; X86-SSE2-NEXT: movl %eax, (%ecx) -; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shldl %cl, %eax, %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %edx, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) +; X86-SSE2-NEXT: movl %edi, 4(%eax) +; X86-SSE2-NEXT: movl %ebp, (%eax) ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx @@ -561,8 +560,8 @@ define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl @@ -733,8 +732,8 @@ define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 0ca3380d188b7..70369f9ded418 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -106,7 +106,7 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; ; GFNIAVX512VL-LABEL: var_fshl_v16i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1 @@ -143,41 +143,35 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm3 ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 -; GFNISSE-NEXT: pand %xmm5, %xmm0 +; GFNISSE-NEXT: pand %xmm4, %xmm0 +; GFNISSE-NEXT: movdqa %xmm1, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: paddb %xmm0, %xmm4 -; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; GFNISSE-NEXT: paddb %xmm4, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 -; GFNISSE-NEXT: pandn %xmm5, %xmm3 -; GFNISSE-NEXT: psllw $5, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: paddb %xmm3, %xmm4 -; GFNISSE-NEXT: paddb %xmm2, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 +; GFNISSE-NEXT: pblendvb %xmm0, 
%xmm5, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm5 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; GFNISSE-NEXT: pandn %xmm4, %xmm3 +; GFNISSE-NEXT: paddb %xmm2, %xmm2 +; GFNISSE-NEXT: movdqa %xmm2, %xmm4 +; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; GFNISSE-NEXT: psllw $5, %xmm3 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3 ; GFNISSE-NEXT: paddb %xmm2, %xmm3 -; GFNISSE-NEXT: paddb %xmm4, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; GFNISSE-NEXT: por %xmm1, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0 @@ -212,7 +206,7 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; ; GFNIAVX2-LABEL: var_fshr_v16i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; GFNIAVX2-NEXT: vpsllw $5, %xmm4, %xmm4 ; GFNIAVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 @@ -239,7 +233,7 @@ define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou ; ; GFNIAVX512VL-LABEL: var_fshr_v16i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -277,9 +271,9 @@ define <16 x i8> @splatvar_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; GFNISSE-NEXT: psllw %xmm2, %xmm3 -; GFNISSE-NEXT: psrlw $8, %xmm3 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNISSE-NEXT: psllw %xmm2, %xmm1 +; GFNISSE-NEXT: 
psrlw $8, %xmm3 ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -290,11 +284,11 @@ define <16 x i8> @splatvar_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNIAVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; GFNIAVX-NEXT: vpsrlw $8, %xmm3, %xmm3 ; GFNIAVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; GFNIAVX-NEXT: vpsrlw $8, %xmm3, %xmm1 ; GFNIAVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; GFNIAVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX-NEXT: retq %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat) @@ -308,10 +302,10 @@ define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; GFNISSE-NEXT: psrlw %xmm2, %xmm4 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; GFNISSE-NEXT: pand %xmm3, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNISSE-NEXT: psrlw %xmm2, %xmm1 +; GFNISSE-NEXT: pand %xmm3, %xmm4 ; GFNISSE-NEXT: pand %xmm1, %xmm3 ; GFNISSE-NEXT: packuswb %xmm4, %xmm3 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 @@ -323,11 +317,11 @@ define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatvar_fshr_v16i8: @@ -335,12 +329,12 @@ define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; GFNIAVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpand %xmm4, %xmm3, %xmm1 ; GFNIAVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 -; GFNIAVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: splatvar_fshr_v16i8: @@ -348,12 +342,12 @@ define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm1 ; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0 -; GFNIAVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: splatvar_fshr_v16i8: @@ -377,9 +371,9 @@ define <16 x i8> @constant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-NEXT: movdqa %xmm1, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2] -; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm2, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -389,22 +383,22 @@ define <16 x i8> @constant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX1OR2: # %bb.0: ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_fshl_v16i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: constant_fshl_v16i8: @@ -428,9 +422,9 @@ define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-NEXT: movdqa %xmm1, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2] -; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm2, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -440,22 +434,22 @@ define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX1OR2: # %bb.0: ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_fshr_v16i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: constant_fshr_v16i8: @@ -696,10 
+690,10 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; ; GFNIAVX512VL-LABEL: var_fshl_v32i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 -; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] +; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 @@ -762,7 +756,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm2 ; GFNISSE-NEXT: paddb %xmm4, %xmm4 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm11 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: movdqa %xmm4, %xmm12 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12 ; GFNISSE-NEXT: pandn %xmm7, %xmm6 @@ -780,38 +774,38 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: paddb %xmm6, %xmm6 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4 -; GFNISSE-NEXT: por %xmm2, %xmm4 -; GFNISSE-NEXT: movdqa %xmm3, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm2 +; GFNISSE-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm6 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0 ; GFNISSE-NEXT: pand %xmm7, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm3 +; GFNISSE-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm6 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm3 +; GFNISSE-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm6 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm3 ; GFNISSE-NEXT: paddb %xmm1, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm2 +; GFNISSE-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm6 ; GFNISSE-NEXT: pandn %xmm7, %xmm5 ; GFNISSE-NEXT: psllw $5, %xmm5 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm6 ; GFNISSE-NEXT: paddb %xmm5, %xmm5 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: paddb %xmm1, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE-NEXT: paddb %xmm1, %xmm6 ; GFNISSE-NEXT: paddb %xmm5, %xmm5 ; GFNISSE-NEXT: movdqa %xmm5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, 
%xmm1 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 +; GFNISSE-NEXT: por %xmm2, %xmm4 ; GFNISSE-NEXT: por %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: retq @@ -822,7 +816,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm4, %xmm6 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm8 @@ -853,12 +847,11 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm12 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm12, %xmm8, %xmm7 -; GFNIAVX1-NEXT: vpor %xmm4, %xmm7, %xmm4 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5 -; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm7 -; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm8 +; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm5, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm5 -; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6 +; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm6 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm1, %xmm5 ; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 @@ -874,8 +867,9 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm4, %xmm7, %xmm2 ; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: var_fshr_v32i8: @@ -907,7 +901,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; ; GFNIAVX512VL-LABEL: var_fshr_v32i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -975,8 +969,8 @@ define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] @@ -998,11 +992,11 @@ define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatvar_fshl_v32i8: @@ -1010,11 +1004,11 @@ define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt ; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; GFNIAVX512-NEXT: vpsrlw $8, %ymm3, %ymm3 ; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpsrlw $8, %ymm3, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat) @@ -1028,7 +1022,7 @@ define <32 x i8> @splatvar_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; GFNISSE-NEXT: psrlw %xmm4, %xmm6 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; GFNISSE-NEXT: psrlw %xmm4, %xmm2 @@ -1075,38 +1069,25 @@ define <32 x i8> @splatvar_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt ; GFNIAVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; -; GFNIAVX512VL-LABEL: splatvar_fshr_v32i8: -; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: retq -; -; GFNIAVX512BW-LABEL: splatvar_fshr_v32i8: -; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: retq +; GFNIAVX512-LABEL: splatvar_fshr_v32i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; GFNIAVX512-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; GFNIAVX512-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpand %ymm4, %ymm3, %ymm1 +; GFNIAVX512-NEXT: vpand %ymm4, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat) ret <32 x i8> %res @@ -1117,11 +1098,11 @@ define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm5 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm6, %xmm5 ; GFNISSE-NEXT: psrlw $8, %xmm5 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm4, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2 @@ -1142,11 +1123,11 @@ define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 @@ -1164,22 +1145,22 @@ define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_fshl_v32i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: constant_fshl_v32i8: @@ -1201,11 +1182,11 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm5 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm6, %xmm5 ; GFNISSE-NEXT: psrlw $8, %xmm5 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm4, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: packuswb %xmm5, %xmm2 @@ -1226,11 +1207,11 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 @@ -1248,22 +1229,22 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_fshr_v32i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: constant_fshr_v32i8: @@ -1285,7 +1266,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, 
%xmm4, %xmm2 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm5 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: por %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 @@ -1493,7 +1474,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; ; GFNIAVX1-LABEL: var_fshl_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX1-NEXT: vandps %ymm7, %ymm4, %ymm8 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -1681,7 +1662,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm8 ; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm8, %ymm9 @@ -1705,29 +1686,29 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm4 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 -; GFNIAVX512VL-NEXT: vpsllw $5, %ymm8, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm7 -; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm7 -; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm4, %ymm6 +; GFNIAVX512VL-NEXT: vpsllw $5, %ymm8, %ymm7 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm4, %ymm8 +; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm8 +; GFNIAVX512VL-NEXT: vpaddb %ymm7, 
%ymm7, %ymm7 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; @@ -1753,116 +1734,111 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-LABEL: var_fshr_v64i8: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm7, %xmm10 -; GFNISSE-NEXT: movdqa %xmm6, %xmm7 -; GFNISSE-NEXT: movdqa %xmm5, %xmm6 -; GFNISSE-NEXT: movdqa %xmm4, %xmm5 -; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: movdqa %xmm2, %xmm3 -; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: movdqa %xmm0, %xmm1 +; GFNISSE-NEXT: movdqa %xmm4, %xmm7 +; GFNISSE-NEXT: movdqa %xmm0, %xmm4 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; GFNISSE-NEXT: movdqa %xmm5, %xmm12 +; GFNISSE-NEXT: movdqa %xmm7, %xmm12 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 ; GFNISSE-NEXT: pand %xmm11, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm5 -; GFNISSE-NEXT: movdqa %xmm5, %xmm13 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm7 +; GFNISSE-NEXT: movdqa %xmm7, %xmm13 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm5 -; GFNISSE-NEXT: movdqa %xmm5, %xmm14 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm7 +; GFNISSE-NEXT: movdqa %xmm7, %xmm14 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm5 -; GFNISSE-NEXT: paddb %xmm1, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm15 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm7 +; GFNISSE-NEXT: paddb %xmm4, %xmm4 +; GFNISSE-NEXT: movdqa %xmm4, %xmm15 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; GFNISSE-NEXT: movdqa %xmm11, %xmm12 ; GFNISSE-NEXT: pandn %xmm11, %xmm9 ; GFNISSE-NEXT: psllw $5, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm4 +; GFNISSE-NEXT: movdqa %xmm4, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm8 -; GFNISSE-NEXT: paddb %xmm1, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4 +; GFNISSE-NEXT: movdqa %xmm4, %xmm8 +; GFNISSE-NEXT: paddb 
%xmm4, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; GFNISSE-NEXT: movdqa %xmm6, %xmm8 +; GFNISSE-NEXT: movdqa %xmm5, %xmm8 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 ; GFNISSE-NEXT: pand %xmm12, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 -; GFNISSE-NEXT: movdqa %xmm6, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5 +; GFNISSE-NEXT: movdqa %xmm5, %xmm8 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 -; GFNISSE-NEXT: movdqa %xmm6, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5 +; GFNISSE-NEXT: movdqa %xmm5, %xmm8 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm14 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 -; GFNISSE-NEXT: paddb %xmm2, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm8 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm15 = [16909320,16909320] +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5 +; GFNISSE-NEXT: paddb %xmm1, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm8 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm15 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8 ; GFNISSE-NEXT: pandn %xmm12, %xmm9 ; GFNISSE-NEXT: psllw $5, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm8 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm0 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm0, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm8 -; GFNISSE-NEXT: paddb %xmm2, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm8 +; GFNISSE-NEXT: paddb %xmm1, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; GFNISSE-NEXT: movdqa %xmm7, %xmm8 +; GFNISSE-NEXT: movdqa %xmm6, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 ; GFNISSE-NEXT: pand %xmm12, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7 -; GFNISSE-NEXT: movdqa %xmm7, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 +; GFNISSE-NEXT: movdqa %xmm6, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7 -; GFNISSE-NEXT: movdqa %xmm7, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 +; GFNISSE-NEXT: movdqa %xmm6, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7 -; GFNISSE-NEXT: paddb %xmm3, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 +; GFNISSE-NEXT: paddb %xmm2, %xmm2 +; GFNISSE-NEXT: movdqa %xmm2, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8 ; 
GFNISSE-NEXT: pandn %xmm12, %xmm9 ; GFNISSE-NEXT: psllw $5, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 +; GFNISSE-NEXT: movdqa %xmm2, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 -; GFNISSE-NEXT: movdqa %xmm3, %xmm8 -; GFNISSE-NEXT: paddb %xmm3, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 +; GFNISSE-NEXT: movdqa %xmm2, %xmm8 +; GFNISSE-NEXT: paddb %xmm2, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; GFNISSE-NEXT: movdqa %xmm10, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8 @@ -1878,31 +1854,28 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10 -; GFNISSE-NEXT: paddb %xmm4, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm8 +; GFNISSE-NEXT: paddb %xmm3, %xmm3 +; GFNISSE-NEXT: movdqa %xmm3, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm15, %xmm8 ; GFNISSE-NEXT: pandn %xmm12, %xmm9 ; GFNISSE-NEXT: psllw $5, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 +; GFNISSE-NEXT: movdqa %xmm3, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm8 -; GFNISSE-NEXT: paddb %xmm4, %xmm8 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 +; GFNISSE-NEXT: movdqa %xmm3, %xmm8 +; GFNISSE-NEXT: paddb %xmm3, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3 +; GFNISSE-NEXT: por %xmm7, %xmm4 ; GFNISSE-NEXT: por %xmm5, %xmm1 ; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: por %xmm7, %xmm3 -; GFNISSE-NEXT: por %xmm10, %xmm4 -; GFNISSE-NEXT: movdqa %xmm1, %xmm0 -; GFNISSE-NEXT: movdqa %xmm2, %xmm1 -; GFNISSE-NEXT: movdqa %xmm3, %xmm2 -; GFNISSE-NEXT: movdqa %xmm4, %xmm3 +; GFNISSE-NEXT: por %xmm10, %xmm3 +; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: var_fshr_v64i8: @@ -1911,7 +1884,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm7 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm8, %xmm9 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX1-NEXT: vandps %ymm6, %ymm4, %ymm11 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm11, %xmm10 ; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm12 @@ -1942,54 +1915,54 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 ; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13 -; GFNIAVX1-NEXT: vpor 
%xmm12, %xmm13, %xmm12 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm13 -; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm14 -; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm13 -; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14 -; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm13 -; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14 -; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm14 +; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm14 +; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm14 +; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm2, %xmm14 ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm0, %xmm13 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm0, %xmm2 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm11, %xmm11 ; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm0, %xmm13 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm0, %xmm2 ; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm13 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 ; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm11, %xmm12 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpor %xmm12, %xmm13, %xmm11 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm12, %xmm13 ; GFNIAVX1-NEXT: vandps %ymm6, %ymm5, %ymm2 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm13 -; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm11, %xmm12 -; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13 -; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm11, %xmm12 +; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm13, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm12, %xmm13 +; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm13, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm12, %xmm13 +; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 +; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm13, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vpor %xmm0, %xmm14, %xmm0 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm13 ; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13 -; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 -; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm12, %xmm13 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; 
GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm12, %xmm13 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm14, %xmm13, %xmm13 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12 -; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm13 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm14, %xmm13, %xmm13 +; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm5 -; GFNIAVX1-NEXT: vpor %xmm5, %xmm11, %xmm5 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm14, %xmm13, %xmm5 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vpor %xmm5, %xmm12, %xmm5 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm3, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm11 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm7, %xmm3, %xmm3 @@ -2030,8 +2003,8 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11 ; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm11 ; GFNIAVX2-NEXT: vpandn %ymm6, %ymm4, %ymm4 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4 @@ -2043,29 +2016,29 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm12 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpand %ymm6, %ymm5, %ymm4 -; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm3, %ymm4 +; GFNIAVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 +; GFNIAVX2-NEXT: vpsllw $5, %ymm7, %ymm7 +; GFNIAVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm3, %ymm4 +; GFNIAVX2-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; GFNIAVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm3, %ymm4 +; GFNIAVX2-NEXT: vpaddb %ymm7, %ymm7, %ymm7 +; GFNIAVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm3 -; GFNIAVX2-NEXT: vpandn %ymm6, %ymm5, %ymm4 -; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm11, %ymm1, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4 
+; GFNIAVX2-NEXT: vpandn %ymm6, %ymm5, %ymm5 +; GFNIAVX2-NEXT: vpsllw $5, %ymm5, %ymm5 +; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm11, %ymm1, %ymm4 +; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm4 +; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: var_fshr_v64i8: @@ -2073,7 +2046,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandq %zmm6, %zmm2, %zmm2 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm7 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm8 @@ -2095,44 +2068,44 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm4, %ymm8 ; GFNIAVX512VL-NEXT: vpxor %ymm6, %ymm7, %ymm7 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7 -; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm8 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm4, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm9, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm3, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm9, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpxor %ymm6, 
%ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: var_fshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; GFNIAVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -2194,11 +2167,11 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm6 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNIAVX1-NEXT: vpsllw %xmm4, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 @@ -2228,8 +2201,8 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; ; GFNIAVX2-LABEL: splatvar_fshl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; GFNIAVX2-NEXT: vpsllw %xmm4, %ymm5, %ymm5 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm5, %ymm5 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] @@ -2249,8 +2222,8 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] @@ -2272,11 +2245,11 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX512BW-NEXT: vpsllw %xmm2, %zmm3, %zmm3 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; GFNIAVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> 
%a, <64 x i8> %b, <64 x i8> %splat) @@ -2288,10 +2261,10 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; GFNISSE-NEXT: movdqa %xmm4, %xmm10 -; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] ; GFNISSE-NEXT: psrlw %xmm9, %xmm10 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm8, %xmm10 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; GFNISSE-NEXT: psrlw %xmm9, %xmm4 @@ -2335,11 +2308,11 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm8 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; GFNIAVX1-NEXT: vpand %xmm5, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpand %xmm5, %xmm8, %xmm7 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; GFNIAVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNIAVX1-NEXT: vpsrlw %xmm4, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpand %xmm5, %xmm7, %xmm7 @@ -2394,7 +2367,7 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 @@ -2415,12 +2388,12 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm1 ; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat) @@ -2432,11 +2405,11 @@ define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm10 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm9, %xmm10 ; GFNISSE-NEXT: psrlw $8, %xmm10 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm8, %xmm4 ; GFNISSE-NEXT: psrlw $8, %xmm4 ; GFNISSE-NEXT: packuswb %xmm10, %xmm4 @@ -2475,11 +2448,11 @@ define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2] +; 
GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 @@ -2561,11 +2534,11 @@ define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res @@ -2576,11 +2549,11 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm10 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm9, %xmm10 ; GFNISSE-NEXT: psrlw $8, %xmm10 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm8, %xmm4 ; GFNISSE-NEXT: psrlw $8, %xmm4 ; GFNISSE-NEXT: packuswb %xmm10, %xmm4 @@ -2619,11 +2592,11 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; 
GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 @@ -2705,11 +2678,11 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res @@ -2787,7 +2760,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4 -; GFNISSE-NEXT: pmovsxwq {{.*#+}} xmm9 = [258,258] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 ; GFNISSE-NEXT: por %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm5 @@ -2804,8 +2777,8 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX1-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] 
-; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 @@ -2816,8 +2789,8 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/gfni-lzcnt.ll b/llvm/test/CodeGen/X86/gfni-lzcnt.ll index 8e48950c32cd8..03a6438f76566 100644 --- a/llvm/test/CodeGen/X86/gfni-lzcnt.ll +++ b/llvm/test/CodeGen/X86/gfni-lzcnt.ll @@ -137,8 +137,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; GFNIAVX2-LABEL: testv32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX2-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -150,8 +149,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; GFNIAVX512-LABEL: testv32i8: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -212,8 +210,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; GFNIAVX2-LABEL: testv32i8u: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX2-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -225,8 +222,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; GFNIAVX512-LABEL: testv32i8u: ; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; GFNIAVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 
@@ -317,8 +313,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; GFNIAVX2-LABEL: testv64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0 @@ -338,8 +333,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; GFNIAVX512VL-LABEL: testv64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512VL-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1 @@ -361,8 +355,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 ; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z} ; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 @@ -449,8 +442,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; ; GFNIAVX2-LABEL: testv64i8u: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0 @@ -470,8 +462,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; GFNIAVX512VL-LABEL: testv64i8u: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512VL-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1 @@ -493,8 +484,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind 
{ ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1 ; GFNIAVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z} ; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 967f26f70946a..06654a33892ba 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -176,9 +176,9 @@ define <16 x i8> @splatvar_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNISSE-NEXT: psllw %xmm1, %xmm2 -; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: psllw %xmm1, %xmm0 +; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm2, %xmm0 ; GFNISSE-NEXT: retq @@ -188,11 +188,11 @@ define <16 x i8> @splatvar_rotl_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; GFNIAVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; GFNIAVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX-NEXT: retq %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %splat) @@ -206,10 +206,10 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; GFNISSE-NEXT: psrlw %xmm1, %xmm2 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; GFNISSE-NEXT: pand %xmm3, %xmm2 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: psrlw %xmm1, %xmm0 +; GFNISSE-NEXT: pand %xmm3, %xmm2 ; GFNISSE-NEXT: pand %xmm3, %xmm0 ; GFNISSE-NEXT: packuswb %xmm2, %xmm0 ; GFNISSE-NEXT: retq @@ -220,11 +220,11 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; GFNIAVX1-NEXT: 
vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatvar_rotr_v16i8: @@ -232,12 +232,12 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 ; GFNIAVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: splatvar_rotr_v16i8: @@ -245,12 +245,12 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm1 ; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: splatvar_rotr_v16i8: @@ -274,9 +274,9 @@ define <16 x i8> @constant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm1 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,128,64,32,16,8,4,2] -; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm1, %xmm0 ; GFNISSE-NEXT: retq @@ -285,9 +285,9 @@ define <16 x i8> @constant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1OR2: # %bb.0: ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; 
GFNIAVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq @@ -296,9 +296,9 @@ define <16 x i8> @constant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq @@ -307,9 +307,9 @@ define <16 x i8> @constant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512BW-NEXT: retq @@ -323,9 +323,9 @@ define <16 x i8> @constant_rotr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm1 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,128,64,32,16,8,4,2] -; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm1, %xmm0 ; GFNISSE-NEXT: retq @@ -334,9 +334,9 @@ define <16 x i8> @constant_rotr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1OR2: # %bb.0: ; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq @@ -345,9 +345,9 @@ define <16 x i8> @constant_rotr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512VL-NEXT: retq @@ -356,9 
+356,9 @@ define <16 x i8> @constant_rotr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX512BW-NEXT: retq @@ -417,7 +417,7 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm6 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: movdqa %xmm2, %xmm7 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7 ; GFNISSE-NEXT: por %xmm0, %xmm7 @@ -575,61 +575,58 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-LABEL: var_rotr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm7 = [16909320,16909320] -; GFNISSE-NEXT: movdqa %xmm5, %xmm8 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8 -; GFNISSE-NEXT: por %xmm0, %xmm8 -; GFNISSE-NEXT: pxor %xmm4, %xmm4 +; GFNISSE-NEXT: movdqa %xmm0, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNISSE-NEXT: movdqa %xmm4, %xmm7 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7 +; GFNISSE-NEXT: por %xmm0, %xmm7 ; GFNISSE-NEXT: pxor %xmm0, %xmm0 ; GFNISSE-NEXT: psubb %xmm2, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm4 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] -; GFNISSE-NEXT: movdqa %xmm5, %xmm9 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNISSE-NEXT: movdqa %xmm5, %xmm10 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm10 -; GFNISSE-NEXT: por %xmm9, %xmm10 +; GFNISSE-NEXT: movdqa %xmm4, %xmm8 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm8 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] +; GFNISSE-NEXT: movdqa %xmm4, %xmm9 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm9 +; GFNISSE-NEXT: por %xmm8, %xmm9 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] -; GFNISSE-NEXT: movdqa %xmm5, %xmm10 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10 -; GFNISSE-NEXT: movdqa %xmm5, %xmm11 -; GFNISSE-NEXT: paddb %xmm5, %xmm11 -; GFNISSE-NEXT: por %xmm10, %xmm11 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm4 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] +; GFNISSE-NEXT: movdqa %xmm4, %xmm9 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9 +; GFNISSE-NEXT: 
movdqa %xmm4, %xmm10 +; GFNISSE-NEXT: paddb %xmm4, %xmm10 +; GFNISSE-NEXT: por %xmm9, %xmm10 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm5 -; GFNISSE-NEXT: movdqa %xmm1, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm6 -; GFNISSE-NEXT: por %xmm0, %xmm6 -; GFNISSE-NEXT: psubb %xmm3, %xmm4 -; GFNISSE-NEXT: psllw $5, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm4 +; GFNISSE-NEXT: pxor %xmm0, %xmm0 +; GFNISSE-NEXT: movdqa %xmm1, %xmm9 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm9 +; GFNISSE-NEXT: movdqa %xmm1, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm5 +; GFNISSE-NEXT: por %xmm9, %xmm5 +; GFNISSE-NEXT: psubb %xmm3, %xmm0 +; GFNISSE-NEXT: psllw $5, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; GFNISSE-NEXT: movdqa %xmm1, %xmm3 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm3 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm2 -; GFNISSE-NEXT: por %xmm0, %xmm2 -; GFNISSE-NEXT: paddb %xmm4, %xmm4 -; GFNISSE-NEXT: movdqa %xmm4, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm2 +; GFNISSE-NEXT: por %xmm3, %xmm2 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 ; GFNISSE-NEXT: movdqa %xmm1, %xmm2 -; GFNISSE-NEXT: paddb %xmm1, %xmm2 -; GFNISSE-NEXT: por %xmm0, %xmm2 -; GFNISSE-NEXT: paddb %xmm4, %xmm4 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm2 +; GFNISSE-NEXT: movdqa %xmm1, %xmm3 +; GFNISSE-NEXT: paddb %xmm1, %xmm3 +; GFNISSE-NEXT: por %xmm2, %xmm3 +; GFNISSE-NEXT: paddb %xmm0, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; GFNISSE-NEXT: movdqa %xmm5, %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: var_rotr_v32i8: @@ -729,7 +726,7 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512BW-NEXT: vpsrlvw %ymm3, %ymm4, %ymm3 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -789,11 +786,11 @@ define <32 x i8> @splatvar_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; 
GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatvar_rotl_v32i8: @@ -801,11 +798,11 @@ define <32 x i8> @splatvar_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX512-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; GFNIAVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %splat) @@ -819,7 +816,7 @@ define <32 x i8> @splatvar_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; GFNISSE-NEXT: psrlw %xmm2, %xmm3 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm4, %xmm3 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: psrlw %xmm2, %xmm0 @@ -863,38 +860,25 @@ define <32 x i8> @splatvar_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; -; GFNIAVX512VL-LABEL: splatvar_rotr_v32i8: -; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512VL-NEXT: 
vpand %ymm3, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: retq -; -; GFNIAVX512BW-LABEL: splatvar_rotr_v32i8: -; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; GFNIAVX512BW-NEXT: retq +; GFNIAVX512-LABEL: splatvar_rotr_v32i8: +; GFNIAVX512: # %bb.0: +; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; GFNIAVX512-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; GFNIAVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpand %ymm3, %ymm2, %ymm1 +; GFNIAVX512-NEXT: vpand %ymm3, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> %splat) ret <32 x i8> %res @@ -905,11 +889,11 @@ define <32 x i8> @constant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm3, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm4, %xmm0 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm2, %xmm0 @@ -927,11 +911,11 @@ define <32 x i8> @constant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = 
[1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -949,9 +933,9 @@ define <32 x i8> @constant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -960,9 +944,9 @@ define <32 x i8> @constant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq @@ -971,9 +955,9 @@ define <32 x i8> @constant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512BW-NEXT: retq @@ -986,11 +970,11 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm3, %xmm2 ; GFNISSE-NEXT: psrlw $8, %xmm2 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm4, %xmm0 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm2, %xmm0 @@ -1008,11 +992,11 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: 
; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -1030,9 +1014,9 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -1041,9 +1025,9 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq @@ -1052,9 +1036,9 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512BW-NEXT: retq @@ -1117,7 +1101,7 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm4 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm10 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: movdqa %xmm4, %xmm11 ; 
GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11 ; GFNISSE-NEXT: por %xmm0, %xmm11 @@ -1262,24 +1246,23 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm10, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm9 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm10 -; GFNIAVX1-NEXT: vpor %xmm9, %xmm10, %xmm9 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 -; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm9 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm11 -; GFNIAVX1-NEXT: vpor %xmm9, %xmm11, %xmm9 -; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9 -; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm11 -; GFNIAVX1-NEXT: vpor %xmm9, %xmm11, %xmm9 -; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm10 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm11 +; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 +; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm10 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm12 +; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10 +; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10 +; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm12 +; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10 +; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 +; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5 ; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 @@ -1295,20 +1278,21 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 ; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: var_rotl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 -; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 -; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm6 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7 +; GFNIAVX2-NEXT: vpor %ymm6, %ymm7, %ymm6 +; 
GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9 ; GFNIAVX2-NEXT: vpor %ymm7, %ymm9, %ymm7 @@ -1321,11 +1305,11 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 @@ -1339,44 +1323,44 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; ; GFNIAVX512VL-LABEL: var_rotl_v64i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm2, %ymm3, %ymm4 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6 ; GFNIAVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm7 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm3, %ymm9 ; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9 -; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm10 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm3, %ymm9 +; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 ; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm10, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, 
%ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm9, %ymm3, %ymm3 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm5 -; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3 +; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4 -; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; GFNIAVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: var_rotl_v64i8: @@ -1399,107 +1383,107 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-LABEL: var_rotr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa %xmm0, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm11 = [16909320,16909320] -; GFNISSE-NEXT: movdqa %xmm9, %xmm12 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12 -; GFNISSE-NEXT: por %xmm0, %xmm12 -; GFNISSE-NEXT: pxor %xmm8, %xmm8 +; GFNISSE-NEXT: movdqa %xmm0, %xmm8 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNISSE-NEXT: movdqa %xmm8, %xmm11 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11 +; GFNISSE-NEXT: por %xmm0, %xmm11 ; GFNISSE-NEXT: pxor %xmm0, %xmm0 ; GFNISSE-NEXT: psubb %xmm4, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm9 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm8 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] -; GFNISSE-NEXT: movdqa %xmm9, %xmm13 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm13 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNISSE-NEXT: movdqa %xmm9, %xmm14 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm14 -; GFNISSE-NEXT: por %xmm13, %xmm14 +; GFNISSE-NEXT: movdqa %xmm8, %xmm12 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm12 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] +; GFNISSE-NEXT: movdqa %xmm8, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm13 +; GFNISSE-NEXT: por %xmm12, %xmm13 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm9 -; 
GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] -; GFNISSE-NEXT: movdqa %xmm9, %xmm14 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm14 -; GFNISSE-NEXT: movdqa %xmm9, %xmm15 -; GFNISSE-NEXT: paddb %xmm9, %xmm15 -; GFNISSE-NEXT: por %xmm14, %xmm15 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm8 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] +; GFNISSE-NEXT: movdqa %xmm8, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13 +; GFNISSE-NEXT: movdqa %xmm8, %xmm14 +; GFNISSE-NEXT: paddb %xmm8, %xmm14 +; GFNISSE-NEXT: por %xmm13, %xmm14 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm9 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm8 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm14 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm14 -; GFNISSE-NEXT: por %xmm0, %xmm14 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 +; GFNISSE-NEXT: movdqa %xmm1, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm13 +; GFNISSE-NEXT: por %xmm0, %xmm13 ; GFNISSE-NEXT: pxor %xmm0, %xmm0 ; GFNISSE-NEXT: psubb %xmm5, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm1 +; GFNISSE-NEXT: pxor %xmm5, %xmm5 +; GFNISSE-NEXT: movdqa %xmm1, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm13 ; GFNISSE-NEXT: movdqa %xmm1, %xmm14 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm14 -; GFNISSE-NEXT: por %xmm5, %xmm14 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm14 +; GFNISSE-NEXT: por %xmm13, %xmm14 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1 -; GFNISSE-NEXT: movdqa %xmm1, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm5 +; GFNISSE-NEXT: movdqa %xmm1, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13 ; GFNISSE-NEXT: movdqa %xmm1, %xmm14 ; GFNISSE-NEXT: paddb %xmm1, %xmm14 -; GFNISSE-NEXT: por %xmm5, %xmm14 +; GFNISSE-NEXT: por %xmm13, %xmm14 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm1 ; GFNISSE-NEXT: movdqa %xmm2, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5 -; GFNISSE-NEXT: por %xmm0, %xmm5 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 +; GFNISSE-NEXT: movdqa %xmm2, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm13 +; GFNISSE-NEXT: por %xmm0, %xmm13 ; GFNISSE-NEXT: pxor %xmm0, %xmm0 ; GFNISSE-NEXT: psubb %xmm6, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm6 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm6 -; GFNISSE-NEXT: por %xmm5, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm6 +; GFNISSE-NEXT: movdqa %xmm2, %xmm13 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm13 +; GFNISSE-NEXT: por %xmm6, %xmm13 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa %xmm2, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm5 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm6 -; GFNISSE-NEXT: paddb %xmm2, %xmm6 -; GFNISSE-NEXT: por %xmm5, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm6 +; GFNISSE-NEXT: movdqa %xmm2, %xmm13 +; 
GFNISSE-NEXT: paddb %xmm2, %xmm13 +; GFNISSE-NEXT: por %xmm6, %xmm13 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm2 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0 -; GFNISSE-NEXT: movdqa %xmm3, %xmm5 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm5 -; GFNISSE-NEXT: por %xmm0, %xmm5 -; GFNISSE-NEXT: psubb %xmm7, %xmm8 -; GFNISSE-NEXT: psllw $5, %xmm8 -; GFNISSE-NEXT: movdqa %xmm8, %xmm0 -; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm3 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 +; GFNISSE-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm6 +; GFNISSE-NEXT: por %xmm0, %xmm6 +; GFNISSE-NEXT: psubb %xmm7, %xmm5 +; GFNISSE-NEXT: psllw $5, %xmm5 +; GFNISSE-NEXT: movdqa %xmm5, %xmm0 +; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm3 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: movdqa %xmm3, %xmm4 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm4 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm4 ; GFNISSE-NEXT: por %xmm0, %xmm4 -; GFNISSE-NEXT: paddb %xmm8, %xmm8 -; GFNISSE-NEXT: movdqa %xmm8, %xmm0 +; GFNISSE-NEXT: paddb %xmm5, %xmm5 +; GFNISSE-NEXT: movdqa %xmm5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 ; GFNISSE-NEXT: movdqa %xmm3, %xmm0 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0 +; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm0 ; GFNISSE-NEXT: movdqa %xmm3, %xmm4 ; GFNISSE-NEXT: paddb %xmm3, %xmm4 ; GFNISSE-NEXT: por %xmm0, %xmm4 -; GFNISSE-NEXT: paddb %xmm8, %xmm8 -; GFNISSE-NEXT: movdqa %xmm8, %xmm0 +; GFNISSE-NEXT: paddb %xmm5, %xmm5 +; GFNISSE-NEXT: movdqa %xmm5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; GFNISSE-NEXT: movdqa %xmm9, %xmm0 +; GFNISSE-NEXT: movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: var_rotr_v64i8: @@ -1549,25 +1533,24 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm11, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm10 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm11 -; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; GFNIAVX1-NEXT: vpsubb %xmm11, %xmm6, %xmm11 -; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm10 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm12 -; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10 -; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10 -; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm12 -; GFNIAVX1-NEXT: vpor %xmm10, %xmm12, %xmm10 -; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11 -; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm11 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm12 +; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 +; GFNIAVX1-NEXT: vpsubb %xmm12, %xmm6, %xmm12 +; GFNIAVX1-NEXT: vpsllw $5, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm11 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, 
%xmm8, %xmm2, %xmm13 +; GFNIAVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 +; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm2, %xmm11 +; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm13 +; GFNIAVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 +; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12 +; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm1, %xmm5 ; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 @@ -1584,45 +1567,46 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 ; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: var_rotr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 -; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm6 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm7 +; GFNIAVX2-NEXT: vpor %ymm6, %ymm7, %ymm6 ; GFNIAVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7 ; GFNIAVX2-NEXT: vpsubb %ymm2, %ymm7, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10 -; GFNIAVX2-NEXT: vpor %ymm8, %ymm10, %ymm8 -; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] +; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm9 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10 +; GFNIAVX2-NEXT: vpor %ymm9, %ymm10, %ymm9 +; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm11 ; GFNIAVX2-NEXT: vpor %ymm10, %ymm11, %ymm10 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm2 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm4 +; GFNIAVX2-NEXT: 
vgf2p8affineqb $0, %ymm5, %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm7, %ymm3 ; GFNIAVX2-NEXT: vpsllw $5, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm2 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm4 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm1, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 @@ -1641,14 +1625,14 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9 -; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7 -; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm8 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9 +; GFNIAVX512VL-NEXT: vpor %ymm8, %ymm9, %ymm8 +; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm8, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11 ; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm11, %ymm9 @@ -1660,11 +1644,11 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm4 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm4 ; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm3 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm0, %ymm4 ; GFNIAVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -1679,7 +1663,7 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] @@ -1768,8 +1752,8 @@ define <64 x i8> @splatvar_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; ; GFNIAVX2-LABEL: splatvar_rotl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -1811,11 +1795,11 @@ define <64 x i8> @splatvar_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; GFNIAVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %splat) @@ -1829,7 +1813,7 @@ define <64 x i8> @splatvar_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; GFNISSE-NEXT: psrlw %xmm4, %xmm6 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: psrlw %xmm4, %xmm0 @@ -1925,7 +1909,7 @@ define <64 x i8> @splatvar_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 @@ -1946,12 +1930,12 @@ define <64 x i8> @splatvar_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; GFNIAVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 
x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> %splat) @@ -1963,11 +1947,11 @@ define <64 x i8> @constant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: psrlw $8, %xmm6 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm5, %xmm0 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm6, %xmm0 @@ -2001,11 +1985,11 @@ define <64 x i8> @constant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 @@ -2085,9 +2069,9 @@ define <64 x i8> @constant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq @@ -2100,11 +2084,11 @@ define <64 x i8> @constant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] ; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: psrlw $8, %xmm6 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: pmullw %xmm5, %xmm0 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm6, %xmm0 @@ -2138,11 +2122,11 @@ define <64 x i8> 
@constant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 @@ -2222,9 +2206,9 @@ define <64 x i8> @constant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index 5cd1a2c76762e..7fac6b591ffea 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -13,9 +13,9 @@ define <16 x i8> @var_shl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-LABEL: var_shl_v16i8: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: psllw $5, %xmm1 ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; GFNISSE-NEXT: psllw $5, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3 @@ -69,9 +69,9 @@ define <16 x i8> @var_lshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-LABEL: var_lshr_v16i8: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: psllw $5, %xmm1 ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; GFNISSE-NEXT: psllw $5, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm3 @@ -139,7 +139,6 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-NEXT: psraw $1, %xmm4 ; GFNISSE-NEXT: paddw %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; GFNISSE-NEXT: psrlw $8, %xmm3 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; GFNISSE-NEXT: movdqa %xmm1, %xmm2 @@ -153,6 +152,7 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, 
<16 x i8> %b) nounwind { ; GFNISSE-NEXT: psraw $1, %xmm2 ; GFNISSE-NEXT: paddw %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; GFNISSE-NEXT: psrlw $8, %xmm3 ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -171,7 +171,6 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX1OR2-NEXT: vpsraw $1, %xmm3, %xmm4 ; GFNIAVX1OR2-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; GFNIAVX1OR2-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1OR2-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -182,8 +181,9 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX1OR2-NEXT: vpsraw $1, %xmm0, %xmm3 ; GFNIAVX1OR2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; GFNIAVX1OR2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512VL-LABEL: var_ashr_v16i8: @@ -323,12 +323,12 @@ define <16 x i8> @splatvar_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNISSE-NEXT: psrlw %xmm1, %xmm0 ; GFNISSE-NEXT: pcmpeqd %xmm2, %xmm2 ; GFNISSE-NEXT: psrlw %xmm1, %xmm2 +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; GFNISSE-NEXT: psrlw %xmm1, %xmm3 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNISSE-NEXT: pand %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; GFNISSE-NEXT: psrlw %xmm1, %xmm2 -; GFNISSE-NEXT: pxor %xmm2, %xmm0 -; GFNISSE-NEXT: psubb %xmm2, %xmm0 +; GFNISSE-NEXT: pxor %xmm3, %xmm0 +; GFNISSE-NEXT: psubb %xmm3, %xmm0 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: splatvar_ashr_v16i8: @@ -337,10 +337,10 @@ define <16 x i8> @splatvar_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; GFNIAVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -351,11 +351,11 @@ define <16 x i8> @splatvar_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; GFNIAVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; GFNIAVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; GFNIAVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; GFNIAVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; GFNIAVX2-NEXT: 
retq @@ -438,8 +438,8 @@ define <16 x i8> @constant_lshr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16,32,64,128,256] -; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] +; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: packuswb %xmm0, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -450,9 +450,9 @@ define <16 x i8> @constant_lshr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] -; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -493,10 +493,10 @@ define <16 x i8> @constant_ashr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; GFNISSE-NEXT: psraw $8, %xmm1 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256] -; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNISSE-NEXT: psraw $8, %xmm0 ; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; GFNISSE-NEXT: psrlw $8, %xmm1 ; GFNISSE-NEXT: psrlw $8, %xmm0 ; GFNISSE-NEXT: packuswb %xmm1, %xmm0 ; GFNISSE-NEXT: retq @@ -506,10 +506,10 @@ define <16 x i8> @constant_ashr_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; GFNIAVX1-NEXT: vpsraw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] -; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] +; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -609,7 +609,7 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm4 ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm5 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6 ; GFNISSE-NEXT: psllw $5, %xmm4 @@ -954,7 +954,6 @@ define <32 x i8> 
@var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpsraw $1, %ymm3, %ymm4 ; GFNIAVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -965,8 +964,9 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpsraw $1, %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: var_ashr_v32i8: @@ -982,7 +982,6 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX512VL-NEXT: vpsraw $1, %ymm3, %ymm4 ; GFNIAVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -993,8 +992,9 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX512VL-NEXT: vpsraw $1, %ymm0, %ymm3 ; GFNIAVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: var_ashr_v32i8: @@ -1017,8 +1017,8 @@ define <32 x i8> @splatvar_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-NEXT: psllw %xmm2, %xmm3 ; GFNISSE-NEXT: pxor %xmm4, %xmm4 ; GFNISSE-NEXT: pshufb %xmm4, %xmm3 -; GFNISSE-NEXT: pand %xmm3, %xmm0 ; GFNISSE-NEXT: psllw %xmm2, %xmm1 +; GFNISSE-NEXT: pand %xmm3, %xmm0 ; GFNISSE-NEXT: pand %xmm3, %xmm1 ; GFNISSE-NEXT: retq ; @@ -1031,10 +1031,10 @@ define <32 x i8> @splatvar_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatvar_shl_v32i8: @@ -1077,8 +1077,8 @@ define <32 x i8> @splatvar_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-NEXT: pcmpeqd %xmm3, %xmm3 ; GFNISSE-NEXT: psrlw %xmm2, %xmm3 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNISSE-NEXT: pand %xmm3, %xmm0 ; GFNISSE-NEXT: psrlw %xmm2, %xmm1 +; GFNISSE-NEXT: pand %xmm3, %xmm0 ; GFNISSE-NEXT: pand %xmm3, %xmm1 ; GFNISSE-NEXT: retq ; @@ -1090,10 +1090,10 @@ define <32 x i8> @splatvar_lshr_v32i8(<32 x i8> %a, <32 x i8> 
%b) nounwind { ; GFNIAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatvar_lshr_v32i8: @@ -1175,11 +1175,11 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; GFNIAVX2-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; GFNIAVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; GFNIAVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -1188,7 +1188,7 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -1213,10 +1213,10 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 ; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm4, %xmm3 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0 @@ -1239,7 +1239,7 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1326,11 +1326,11 @@ define <32 x i8> @constant_lshr_v32i8(<32 x i8> %a) nounwind { ; 
GFNIAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_lshr_v32i8: @@ -1338,11 +1338,11 @@ define <32 x i8> @constant_lshr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: constant_lshr_v32i8: @@ -1413,10 +1413,10 @@ define <32 x i8> @constant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX2-NEXT: vpsraw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX2-NEXT: vpsraw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -1426,10 +1426,10 @@ define <32 x i8> @constant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512VL-NEXT: vpunpckhbw 
{{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; GFNIAVX512VL-NEXT: vpsraw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512VL-NEXT: vpsraw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: retq @@ -1447,7 +1447,7 @@ define <32 x i8> @constant_ashr_v32i8(<32 x i8> %a) nounwind { define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_shl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovsxwq {{.*#+}} xmm2 = [258,258] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq @@ -1516,7 +1516,7 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm8 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm9 = [16909320,16909320] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNISSE-NEXT: movdqa %xmm0, %xmm10 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10 ; GFNISSE-NEXT: psllw $5, %xmm8 @@ -1607,18 +1607,17 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm7 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm7, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6 -; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm6 -; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7 +; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7 +; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vpsllw $5, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 @@ -1628,6 +1627,7 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq 
; @@ -1797,18 +1797,17 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm0, %xmm8 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm8, %xmm0, %xmm0 -; GFNIAVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 -; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm7 -; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7 -; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 +; GFNIAVX1-NEXT: vpsllw $5, %xmm9, %xmm9 +; GFNIAVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm8 +; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm9 +; GFNIAVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm8 +; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm9 +; GFNIAVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vpsllw $5, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 @@ -1818,6 +1817,7 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm1, %xmm4 ; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; @@ -1848,30 +1848,30 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512VL-LABEL: var_lshr_v64i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm2, %ymm3, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm6 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm3, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpblendvb 
%ymm5, %ymm7, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm3, %ymm3 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm3 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: @@ -2083,7 +2083,6 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpsraw $1, %xmm6, %xmm7 ; GFNIAVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm7, %xmm6, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpsraw $4, %xmm5, %xmm6 @@ -2094,20 +2093,21 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpsraw $1, %xmm5, %xmm6 ; GFNIAVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpsllw $5, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX1-NEXT: vpsraw $4, %xmm5, %xmm6 -; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpsraw $2, %xmm5, %xmm6 -; GFNIAVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm5 -; GFNIAVX1-NEXT: vpsraw $1, %xmm5, %xmm6 -; GFNIAVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm5, %xmm4 -; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; GFNIAVX1-NEXT: vpsraw $4, %xmm6, %xmm7 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpsraw $2, %xmm6, %xmm7 +; GFNIAVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm6 +; GFNIAVX1-NEXT: vpsraw $1, %xmm6, %xmm7 +; GFNIAVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm5 +; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 +; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm4 ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; GFNIAVX1-NEXT: vpsraw $4, %xmm1, %xmm5 @@ -2276,8 +2276,8 @@ define <64 x i8> 
@splatvar_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-NEXT: psllw %xmm4, %xmm5 ; GFNISSE-NEXT: pxor %xmm6, %xmm6 ; GFNISSE-NEXT: pshufb %xmm6, %xmm5 -; GFNISSE-NEXT: pand %xmm5, %xmm0 ; GFNISSE-NEXT: psllw %xmm4, %xmm1 +; GFNISSE-NEXT: pand %xmm5, %xmm0 ; GFNISSE-NEXT: pand %xmm5, %xmm1 ; GFNISSE-NEXT: psllw %xmm4, %xmm2 ; GFNISSE-NEXT: pand %xmm5, %xmm2 @@ -2297,10 +2297,10 @@ define <64 x i8> @splatvar_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 ; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 @@ -2313,8 +2313,8 @@ define <64 x i8> @splatvar_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; GFNIAVX2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 ; GFNIAVX2-NEXT: vpbroadcastb %xmm3, %ymm3 -; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; @@ -2354,8 +2354,8 @@ define <64 x i8> @splatvar_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-NEXT: pcmpeqd %xmm5, %xmm5 ; GFNISSE-NEXT: psrlw %xmm4, %xmm5 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; GFNISSE-NEXT: pand %xmm5, %xmm0 ; GFNISSE-NEXT: psrlw %xmm4, %xmm1 +; GFNISSE-NEXT: pand %xmm5, %xmm0 ; GFNISSE-NEXT: pand %xmm5, %xmm1 ; GFNISSE-NEXT: psrlw %xmm4, %xmm2 ; GFNISSE-NEXT: pand %xmm5, %xmm2 @@ -2374,10 +2374,10 @@ define <64 x i8> @splatvar_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 @@ -2391,8 +2391,8 @@ define <64 x i8> @splatvar_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 ; GFNIAVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; GFNIAVX2-NEXT: vpbroadcastb %xmm3, %ymm3 -; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq ; @@ -2446,8 +2446,8 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-NEXT: psrlw %xmm4, %xmm2 ; GFNISSE-NEXT: pand %xmm5, %xmm2 ; GFNISSE-NEXT: pxor %xmm6, %xmm2 -; GFNISSE-NEXT: psubb %xmm6, %xmm2 ; GFNISSE-NEXT: psrlw %xmm4, %xmm3 +; GFNISSE-NEXT: psubb %xmm6, %xmm2 ; GFNISSE-NEXT: pand %xmm5, %xmm3 ; GFNISSE-NEXT: pxor %xmm6, %xmm3 ; GFNISSE-NEXT: psubb %xmm6, %xmm3 @@ -2470,10 +2470,10 @@ define <64 x i8> 
@splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpsubb %xmm5, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpand %xmm4, %xmm6, %xmm3 ; GFNIAVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3 ; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1 @@ -2507,7 +2507,7 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; GFNIAVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 ; GFNIAVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; GFNIAVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 @@ -2525,7 +2525,7 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512BW: # %bb.0: ; GFNIAVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; GFNIAVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -2542,10 +2542,10 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm0 @@ -2580,11 +2580,11 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5 ; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; GFNIAVX1-NEXT: 
vpmaddubsw %xmm5, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm3 @@ -2780,11 +2780,11 @@ define <64 x i8> @constant_lshr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -2942,10 +2942,10 @@ define <64 x i8> @constant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; GFNIAVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq @@ -2956,7 +2956,7 @@ define <64 x i8> @constant_ashr_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { ; 
GFNISSE-LABEL: splatconstant_shl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm4 = [66052,66052] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 diff --git a/llvm/test/CodeGen/X86/gfni-tzcnt.ll b/llvm/test/CodeGen/X86/gfni-tzcnt.ll index f424483c53e2c..da2da0dd1d75e 100644 --- a/llvm/test/CodeGen/X86/gfni-tzcnt.ll +++ b/llvm/test/CodeGen/X86/gfni-tzcnt.ll @@ -15,8 +15,8 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; GFNISSE-NEXT: pand %xmm0, %xmm2 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNISSE-NEXT: movdqa %xmm1, %xmm3 -; GFNISSE-NEXT: pshufb %xmm2, %xmm3 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE-NEXT: pshufb %xmm2, %xmm3 ; GFNISSE-NEXT: pshufb %xmm0, %xmm1 ; GFNISSE-NEXT: paddb %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -29,8 +29,8 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; GFNIAVX1OR2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX1OR2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq @@ -42,8 +42,8 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; GFNIAVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; GFNIAVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq @@ -61,8 +61,8 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; GFNISSE-NEXT: pand %xmm0, %xmm2 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNISSE-NEXT: movdqa %xmm1, %xmm3 -; GFNISSE-NEXT: pshufb %xmm2, %xmm3 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE-NEXT: pshufb %xmm2, %xmm3 ; GFNISSE-NEXT: pshufb %xmm0, %xmm1 ; GFNISSE-NEXT: paddb %xmm3, %xmm1 ; GFNISSE-NEXT: movdqa %xmm1, %xmm0 @@ -75,8 +75,8 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; GFNIAVX1OR2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX1OR2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1OR2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX1OR2-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; GFNIAVX1OR2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; GFNIAVX1OR2-NEXT: retq @@ -88,8 +88,8 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; GFNIAVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; GFNIAVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; GFNIAVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; GFNIAVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq @@ -158,10 +158,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; GFNIAVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -172,10 +171,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; GFNIAVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; GFNIAVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; GFNIAVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq @@ -244,10 +242,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; GFNIAVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; GFNIAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -258,10 +255,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; GFNIAVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; GFNIAVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; GFNIAVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; GFNIAVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq @@ -370,8 +366,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; GFNIAVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastb 
{{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm0 @@ -392,10 +387,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; GFNIAVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm4 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm1 @@ -417,10 +411,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; GFNIAVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 -; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq @@ -529,8 +522,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; GFNIAVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm0 @@ -551,10 +543,9 @@ 
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; GFNIAVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm4 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; GFNIAVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm1, %ymm1 @@ -576,10 +567,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; GFNIAVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 -; GFNIAVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; GFNIAVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; GFNIAVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ghc-cc64.ll b/llvm/test/CodeGen/X86/ghc-cc64.ll index 41559b263c0d3..a929f676d8ddd 100644 --- a/llvm/test/CodeGen/X86/ghc-cc64.ll +++ b/llvm/test/CodeGen/X86/ghc-cc64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -tailcallopt -mtriple=x86_64-linux-gnu | FileCheck %s ; Check the GHC call convention works (x86-64) @@ -21,42 +22,68 @@ @d2 = external dso_local global double ; assigned to register: xmm6 define void @zap(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: zap: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rdi, %r13 +; CHECK-NEXT: movq %rsi, %rbp +; CHECK-NEXT: callq addtwo@PLT +; CHECK-NEXT: subq $8, %rsp +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq entry: - ; CHECK: movq %rdi, %r13 - ; CHECK-NEXT: movq %rsi, %rbp - ; CHECK-NEXT: callq addtwo %0 = call ghccc i64 @addtwo(i64 %a, i64 %b) - ; CHECK: callq foo call void @foo() nounwind ret void } define ghccc i64 @addtwo(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: addtwo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %r13, %rax +; 
CHECK-NEXT: addq %rbp, %rax +; CHECK-NEXT: retq $8 entry: - ; CHECK: leaq (%r13,%rbp), %rax %0 = add i64 %x, %y - ; CHECK-NEXT: ret ret i64 %0 } define ghccc void @foo() nounwind { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero +; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movq splim(%rip), %r15 +; CHECK-NEXT: movq r6(%rip), %r9 +; CHECK-NEXT: movq r5(%rip), %r8 +; CHECK-NEXT: movq r4(%rip), %rdi +; CHECK-NEXT: movq r3(%rip), %rsi +; CHECK-NEXT: movq r2(%rip), %r14 +; CHECK-NEXT: movq r1(%rip), %rbx +; CHECK-NEXT: movq hp(%rip), %r12 +; CHECK-NEXT: movq sp(%rip), %rbp +; CHECK-NEXT: movq base(%rip), %r13 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: jmp bar@PLT # TAILCALL entry: - ; CHECK: movsd d2(%rip), %xmm6 - ; CHECK-NEXT: movsd d1(%rip), %xmm5 - ; CHECK-NEXT: movss f4(%rip), %xmm4 - ; CHECK-NEXT: movss f3(%rip), %xmm3 - ; CHECK-NEXT: movss f2(%rip), %xmm2 - ; CHECK-NEXT: movss f1(%rip), %xmm1 - ; CHECK-NEXT: movq splim(%rip), %r15 - ; CHECK-NEXT: movq r6(%rip), %r9 - ; CHECK-NEXT: movq r5(%rip), %r8 - ; CHECK-NEXT: movq r4(%rip), %rdi - ; CHECK-NEXT: movq r3(%rip), %rsi - ; CHECK-NEXT: movq r2(%rip), %r14 - ; CHECK-NEXT: movq r1(%rip), %rbx - ; CHECK-NEXT: movq hp(%rip), %r12 - ; CHECK-NEXT: movq sp(%rip), %rbp - ; CHECK-NEXT: movq base(%rip), %r13 %0 = load double, ptr @d2 %1 = load double, ptr @d1 %2 = load float, ptr @f4 @@ -73,7 +100,6 @@ entry: %13 = load i64, ptr @hp %14 = load i64, ptr @sp %15 = load i64, ptr @base - ; CHECK: jmp bar tail call ghccc void @bar( i64 %15, i64 %14, i64 %13, i64 %12, i64 %11, i64 %10, i64 %9, i64 %8, i64 %7, i64 %6, float %5, float %4, float %3, float %2, double %1, diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index bca446fa8fb56..9dccafb3eda93 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -440,8 +440,8 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) { ; SSE-LABEL: avx_vhadd_pd_test: ; SSE: # %bb.0: ; SSE-NEXT: haddpd %xmm1, %xmm0 -; SSE-NEXT: haddpd %xmm3, %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: haddpd %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: avx_vhadd_pd_test: @@ -479,8 +479,8 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) { ; SSE-LABEL: avx_vhsub_pd_test: ; SSE: # %bb.0: ; SSE-NEXT: hsubpd %xmm1, %xmm0 -; SSE-NEXT: hsubpd %xmm3, %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: hsubpd %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: avx_vhsub_pd_test: @@ -572,8 +572,8 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) { ; SSSE3-LABEL: avx2_vphadd_d_test: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-NEXT: phaddd %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: phaddd %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; AVX1-LABEL: avx2_vphadd_d_test: @@ -638,70 +638,70 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pextrw $1, %xmm0, %eax ; SSE3-NEXT: addl %ecx, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $2, %xmm0, %edx +; SSE3-NEXT: pextrw $2, %xmm0, %esi ; SSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE3-NEXT: addl %edx, %eax +; SSE3-NEXT: pextrw $4, %xmm0, 
%edi +; SSE3-NEXT: pextrw $5, %xmm0, %edx +; SSE3-NEXT: addl %esi, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm0, %edx -; SSE3-NEXT: pextrw $5, %xmm0, %esi -; SSE3-NEXT: addl %edx, %esi -; SSE3-NEXT: pextrw $6, %xmm0, %edx -; SSE3-NEXT: pextrw $7, %xmm0, %r8d -; SSE3-NEXT: addl %edx, %r8d -; SSE3-NEXT: movd %xmm1, %edx -; SSE3-NEXT: pextrw $1, %xmm1, %r10d -; SSE3-NEXT: addl %edx, %r10d -; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $3, %xmm1, %ebx -; SSE3-NEXT: addl %edx, %ebx -; SSE3-NEXT: pextrw $4, %xmm1, %edx -; SSE3-NEXT: pextrw $5, %xmm1, %r14d -; SSE3-NEXT: addl %edx, %r14d -; SSE3-NEXT: pextrw $6, %xmm1, %edx -; SSE3-NEXT: pextrw $7, %xmm1, %r12d -; SSE3-NEXT: addl %edx, %r12d -; SSE3-NEXT: movd %xmm2, %edi -; SSE3-NEXT: pextrw $1, %xmm2, %edx ; SSE3-NEXT: addl %edi, %edx -; SSE3-NEXT: pextrw $2, %xmm2, %r9d -; SSE3-NEXT: pextrw $3, %xmm2, %edi -; SSE3-NEXT: addl %r9d, %edi +; SSE3-NEXT: pextrw $6, %xmm0, %esi +; SSE3-NEXT: pextrw $7, %xmm0, %edi +; SSE3-NEXT: addl %esi, %edi +; SSE3-NEXT: movd %xmm1, %esi +; SSE3-NEXT: pextrw $1, %xmm1, %r9d +; SSE3-NEXT: pextrw $2, %xmm1, %r8d +; SSE3-NEXT: addl %esi, %r9d +; SSE3-NEXT: pextrw $3, %xmm1, %ebx +; SSE3-NEXT: addl %r8d, %ebx +; SSE3-NEXT: pextrw $4, %xmm1, %esi +; SSE3-NEXT: pextrw $5, %xmm1, %ebp +; SSE3-NEXT: pextrw $6, %xmm1, %r8d +; SSE3-NEXT: pextrw $7, %xmm1, %r15d +; SSE3-NEXT: addl %esi, %ebp +; SSE3-NEXT: addl %r8d, %r15d +; SSE3-NEXT: pextrw $1, %xmm2, %esi +; SSE3-NEXT: movd %xmm2, %r8d +; SSE3-NEXT: addl %r8d, %esi +; SSE3-NEXT: pextrw $2, %xmm2, %r10d +; SSE3-NEXT: pextrw $3, %xmm2, %r8d ; SSE3-NEXT: pextrw $4, %xmm2, %r11d -; SSE3-NEXT: pextrw $5, %xmm2, %r9d -; SSE3-NEXT: addl %r11d, %r9d -; SSE3-NEXT: pextrw $6, %xmm2, %ebp +; SSE3-NEXT: addl %r10d, %r8d +; SSE3-NEXT: pextrw $5, %xmm2, %r10d +; SSE3-NEXT: addl %r11d, %r10d +; SSE3-NEXT: pextrw $6, %xmm2, %r14d ; SSE3-NEXT: pextrw $7, %xmm2, %r11d -; SSE3-NEXT: addl %ebp, %r11d -; SSE3-NEXT: movd %xmm3, %r15d -; SSE3-NEXT: pextrw $1, %xmm3, %ebp -; SSE3-NEXT: addl %r15d, %ebp -; SSE3-NEXT: pextrw $2, %xmm3, %r13d -; SSE3-NEXT: pextrw $3, %xmm3, %r15d -; SSE3-NEXT: addl %r13d, %r15d -; SSE3-NEXT: pextrw $4, %xmm3, %r13d -; SSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSE3-NEXT: addl %r13d, %ecx -; SSE3-NEXT: pextrw $6, %xmm3, %r13d -; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm4 -; SSE3-NEXT: movd %r14d, %xmm2 +; SSE3-NEXT: addl %r14d, %r11d +; SSE3-NEXT: movd %xmm3, %r13d +; SSE3-NEXT: pextrw $1, %xmm3, %r14d +; SSE3-NEXT: pextrw $2, %xmm3, %eax +; SSE3-NEXT: pextrw $3, %xmm3, %r12d +; SSE3-NEXT: addl %r13d, %r14d +; SSE3-NEXT: addl %eax, %r12d +; SSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSE3-NEXT: pextrw $5, %xmm3, %r13d +; SSE3-NEXT: pextrw $6, %xmm3, %eax +; SSE3-NEXT: addl %ecx, %r13d +; SSE3-NEXT: pextrw $7, %xmm3, %ecx +; SSE3-NEXT: addl %eax, %ecx +; SSE3-NEXT: movd %r15d, %xmm4 +; SSE3-NEXT: movd %ebp, %xmm2 ; SSE3-NEXT: movd %ebx, %xmm5 -; SSE3-NEXT: movd %r10d, %xmm3 -; SSE3-NEXT: movd %r8d, %xmm6 -; SSE3-NEXT: movd %esi, %xmm7 +; SSE3-NEXT: movd %r9d, %xmm3 +; SSE3-NEXT: movd %edi, %xmm6 +; SSE3-NEXT: movd %edx, %xmm7 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload ; SSE3-NEXT: # xmm8 = mem[0],zero,zero,zero ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movd %ecx, %xmm10 -; SSE3-NEXT: movd %r15d, %xmm11 -; 
SSE3-NEXT: movd %ebp, %xmm12 +; SSE3-NEXT: movd %ecx, %xmm9 +; SSE3-NEXT: movd %r13d, %xmm10 +; SSE3-NEXT: movd %r12d, %xmm11 +; SSE3-NEXT: movd %r14d, %xmm12 ; SSE3-NEXT: movd %r11d, %xmm13 -; SSE3-NEXT: movd %r9d, %xmm14 -; SSE3-NEXT: movd %edi, %xmm15 -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: movd %r10d, %xmm14 +; SSE3-NEXT: movd %r8d, %xmm15 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -727,8 +727,8 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSSE3-LABEL: avx2_vphadd_w_test: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddw %xmm1, %xmm0 -; SSSE3-NEXT: phaddw %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: phaddw %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; AVX1-LABEL: avx2_vphadd_w_test: @@ -846,22 +846,22 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: not_a_hsub_1: ; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpextrd $1, %xmm0, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrd $2, %xmm0, %ecx +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: subl %eax, %ecx +; AVX-NEXT: vpextrd $2, %xmm0, %eax ; AVX-NEXT: vpextrd $3, %xmm0, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpextrd $1, %xmm1, %edx -; AVX-NEXT: vmovd %xmm1, %esi -; AVX-NEXT: subl %esi, %edx -; AVX-NEXT: vpextrd $3, %xmm1, %esi -; AVX-NEXT: vpextrd $2, %xmm1, %edi -; AVX-NEXT: subl %edi, %esi -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX-NEXT: vpextrd $1, %xmm1, %esi +; AVX-NEXT: subl %edx, %eax +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: vpextrd $3, %xmm1, %edi +; AVX-NEXT: subl %edx, %esi +; AVX-NEXT: vpextrd $2, %xmm1, %edx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; AVX-NEXT: subl %edx, %edi +; AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %vecext = extractelement <4 x i32> %A, i32 0 %vecext1 = extractelement <4 x i32> %A, i32 1 @@ -1247,71 +1247,71 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pushq %r12 ; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $1, %xmm0, %edx -; SSE3-NEXT: addl %eax, %edx +; SSE3-NEXT: pextrw $1, %xmm0, %ecx +; SSE3-NEXT: addl %eax, %ecx +; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm0, %eax ; SSE3-NEXT: pextrw $3, %xmm0, %esi +; SSE3-NEXT: pextrw $4, %xmm0, %edx +; SSE3-NEXT: pextrw $5, %xmm0, %r8d ; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r9d -; SSE3-NEXT: addl %eax, %r9d +; SSE3-NEXT: addl %edx, %r8d ; SSE3-NEXT: pextrw $6, %xmm0, %eax ; SSE3-NEXT: pextrw $7, %xmm0, %r10d ; SSE3-NEXT: addl %eax, %r10d -; SSE3-NEXT: movd %xmm1, %ecx +; SSE3-NEXT: movd %xmm1, %edx ; SSE3-NEXT: pextrw $1, %xmm1, %eax -; SSE3-NEXT: addl %ecx, %eax -; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm1, %edi -; SSE3-NEXT: pextrw $3, %xmm1, %eax -; SSE3-NEXT: addl %edi, %eax +; SSE3-NEXT: addl %edx, %eax ; SSE3-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE3-NEXT: pextrw $4, %xmm1, %r8d +; SSE3-NEXT: 
pextrw $3, %xmm1, %edx +; SSE3-NEXT: addl %edi, %edx +; SSE3-NEXT: pextrw $4, %xmm1, %r11d ; SSE3-NEXT: pextrw $5, %xmm1, %edi -; SSE3-NEXT: addl %r8d, %edi -; SSE3-NEXT: pextrw $6, %xmm1, %r11d -; SSE3-NEXT: pextrw $7, %xmm1, %r8d -; SSE3-NEXT: addl %r11d, %r8d +; SSE3-NEXT: pextrw $6, %xmm1, %ebx +; SSE3-NEXT: pextrw $7, %xmm1, %r9d +; SSE3-NEXT: addl %r11d, %edi +; SSE3-NEXT: addl %ebx, %r9d +; SSE3-NEXT: pextrw $1, %xmm2, %ebx ; SSE3-NEXT: movd %xmm2, %r11d -; SSE3-NEXT: pextrw $1, %xmm2, %ebp -; SSE3-NEXT: addl %r11d, %ebp +; SSE3-NEXT: addl %r11d, %ebx ; SSE3-NEXT: pextrw $2, %xmm2, %r11d ; SSE3-NEXT: pextrw $3, %xmm2, %r14d +; SSE3-NEXT: pextrw $4, %xmm2, %ebp ; SSE3-NEXT: addl %r11d, %r14d -; SSE3-NEXT: pextrw $4, %xmm2, %r11d -; SSE3-NEXT: pextrw $5, %xmm2, %r15d -; SSE3-NEXT: addl %r11d, %r15d +; SSE3-NEXT: pextrw $5, %xmm2, %r12d +; SSE3-NEXT: addl %ebp, %r12d ; SSE3-NEXT: pextrw $6, %xmm2, %r11d -; SSE3-NEXT: pextrw $7, %xmm2, %r12d -; SSE3-NEXT: addl %r11d, %r12d -; SSE3-NEXT: movd %xmm3, %ebx +; SSE3-NEXT: pextrw $7, %xmm2, %r13d +; SSE3-NEXT: addl %r11d, %r13d +; SSE3-NEXT: movd %xmm3, %r15d ; SSE3-NEXT: pextrw $1, %xmm3, %r11d -; SSE3-NEXT: addl %ebx, %r11d -; SSE3-NEXT: pextrw $2, %xmm3, %r13d -; SSE3-NEXT: pextrw $3, %xmm3, %ebx -; SSE3-NEXT: addl %r13d, %ebx -; SSE3-NEXT: pextrw $4, %xmm3, %r13d -; SSE3-NEXT: pextrw $5, %xmm3, %ecx -; SSE3-NEXT: addl %r13d, %ecx -; SSE3-NEXT: pextrw $6, %xmm3, %r13d -; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm4 -; SSE3-NEXT: movd %r15d, %xmm2 +; SSE3-NEXT: pextrw $2, %xmm3, %eax +; SSE3-NEXT: pextrw $3, %xmm3, %ebp +; SSE3-NEXT: addl %r15d, %r11d +; SSE3-NEXT: addl %eax, %ebp +; SSE3-NEXT: pextrw $4, %xmm3, %ecx +; SSE3-NEXT: pextrw $5, %xmm3, %r15d +; SSE3-NEXT: pextrw $6, %xmm3, %eax +; SSE3-NEXT: addl %ecx, %r15d +; SSE3-NEXT: pextrw $7, %xmm3, %ecx +; SSE3-NEXT: addl %eax, %ecx +; SSE3-NEXT: movd %r13d, %xmm4 +; SSE3-NEXT: movd %r12d, %xmm2 ; SSE3-NEXT: movd %r14d, %xmm5 -; SSE3-NEXT: movd %ebp, %xmm3 +; SSE3-NEXT: movd %ebx, %xmm3 ; SSE3-NEXT: movd %r10d, %xmm6 -; SSE3-NEXT: movd %r9d, %xmm7 +; SSE3-NEXT: movd %r8d, %xmm7 ; SSE3-NEXT: movd %esi, %xmm8 -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movd %ecx, %xmm10 -; SSE3-NEXT: movd %ebx, %xmm11 +; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE3-NEXT: movd %ecx, %xmm9 +; SSE3-NEXT: movd %r15d, %xmm10 +; SSE3-NEXT: movd %ebp, %xmm11 ; SSE3-NEXT: movd %r11d, %xmm12 -; SSE3-NEXT: movd %r8d, %xmm13 +; SSE3-NEXT: movd %r9d, %xmm13 ; SSE3-NEXT: movd %edi, %xmm14 -; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload -; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero +; SSE3-NEXT: movd %edx, %xmm15 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll index f43a8ee38332a..35166a54a9ed0 100644 --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -53,13 +53,13 @@ define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: haddps %xmm3, %xmm1 +; SSE-NEXT: 
movaps %xmm0, %xmm3 +; SSE-NEXT: haddps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,2] -; SSE-NEXT: haddps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0,3,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v8f32: @@ -84,13 +84,13 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) { define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse2_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: haddps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: haddps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,2] -; SSE-NEXT: haddps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0,3,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse2_v8f32: @@ -122,8 +122,8 @@ define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse3_v8f32: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] ; SSE-NEXT: haddps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm2, %xmm1 @@ -236,34 +236,32 @@ define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v8f64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm1, %xmm8 -; SSE-NEXT: movapd %xmm0, %xmm9 ; SSE-NEXT: haddpd %xmm7, %xmm3 ; SSE-NEXT: haddpd %xmm6, %xmm2 -; SSE-NEXT: haddpd %xmm5, %xmm8 -; SSE-NEXT: haddpd %xmm4, %xmm9 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: haddpd %xmm5, %xmm6 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: haddpd %xmm4, %xmm5 ; SSE-NEXT: movapd %xmm3, %xmm0 ; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm8, %xmm2 -; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: movapd %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v8f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-NEXT: vmovapd %ymm3, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v8f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovapd %ymm3, %ymm0 +; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX2-NEXT: retq %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> @@ -274,34 +272,32 @@ define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) noun define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; SSE-LABEL: hadd_reverse2_v8f64: ; SSE: # %bb.0: -; SSE-NEXT: 
movapd %xmm1, %xmm8 -; SSE-NEXT: movapd %xmm0, %xmm9 ; SSE-NEXT: haddpd %xmm7, %xmm3 ; SSE-NEXT: haddpd %xmm6, %xmm2 -; SSE-NEXT: haddpd %xmm5, %xmm8 -; SSE-NEXT: haddpd %xmm4, %xmm9 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: haddpd %xmm5, %xmm6 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: haddpd %xmm4, %xmm5 ; SSE-NEXT: movapd %xmm3, %xmm0 ; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm8, %xmm2 -; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: movapd %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse2_v8f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-NEXT: vmovapd %ymm3, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse2_v8f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovapd %ymm3, %ymm0 +; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 364ad953a11d4..3234e69d10409 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -1169,8 +1169,8 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) ; SSE-LABEL: hadd_4f64_v4f64_shuffle: ; SSE: # %bb.0: ; SSE-NEXT: haddpd %xmm1, %xmm0 -; SSE-NEXT: haddpd %xmm3, %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: haddpd %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_4f64_v4f64_shuffle: @@ -1197,8 +1197,8 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) ; SSE-LABEL: hsub_4f64_v4f64_shuffle: ; SSE: # %bb.0: ; SSE-NEXT: hsubpd %xmm1, %xmm0 -; SSE-NEXT: hsubpd %xmm3, %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: hsubpd %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hsub_4f64_v4f64_shuffle: @@ -1225,8 +1225,8 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_8f32_v8f32_shuffle: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm1, %xmm0 -; SSE-NEXT: haddps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: haddps %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_8f32_v8f32_shuffle: @@ -1253,8 +1253,8 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hsub_8f32_v8f32_shuffle: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm1, %xmm0 -; SSE-NEXT: haddps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: haddps %xmm3, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hsub_8f32_v8f32_shuffle: @@ -1294,8 +1294,8 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { ; SSSE3-LABEL: hadd_8i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-NEXT: phaddd %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: phaddd %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; AVX1-LABEL: hadd_8i32_v8i32_shuffle: @@ -1338,8 +1338,8 @@ define <8 x i32> 
@hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) { ; SSSE3-LABEL: hsub_8i32_v8i32_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phsubd %xmm1, %xmm0 -; SSSE3-NEXT: phsubd %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: phsubd %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; AVX1-LABEL: hsub_8i32_v8i32_shuffle: @@ -1395,8 +1395,8 @@ define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) { ; SSSE3-LABEL: hadd_16i16_16i16_shuffle: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddw %xmm1, %xmm0 -; SSSE3-NEXT: phaddw %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: phaddw %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; AVX1-LABEL: hadd_16i16_16i16_shuffle: diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 94fa81742ba71..3ffa005899d3b 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -283,8 +283,8 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) { ; ; SSE-FAST-LABEL: test11_undef: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movaps %xmm3, %xmm1 ; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: movaps %xmm3, %xmm1 ; SSE-FAST-NEXT: haddps %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; @@ -680,8 +680,8 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) { ; ; SSE-FAST-LABEL: add_pd_011: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: haddpd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; @@ -888,16 +888,16 @@ define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) { define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { ; SSE-SLOW-LABEL: PR44694: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: haddpd %xmm3, %xmm2 +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE-SLOW-NEXT: movapd %xmm2, %xmm1 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR44694: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movapd %xmm1, %xmm0 ; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 +; SSE-FAST-NEXT: movapd %xmm1, %xmm0 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq @@ -1104,8 +1104,8 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) { ; ; SSE-FAST-LABEL: PR34724_add_v4f64_u123: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movapd %xmm1, %xmm0 ; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 +; SSE-FAST-NEXT: movapd %xmm1, %xmm0 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq @@ -1203,8 +1203,8 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) { ; SSE-FAST-LABEL: PR34724_add_v4f64_01u3: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm3, %xmm3 ; SSE-FAST-NEXT: movapd %xmm3, %xmm1 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3: @@ -1256,8 +1256,8 @@ define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) { ; SSE-FAST-LABEL: PR34724_add_v4f64_012u: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm2, %xmm2 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 +; SSE-FAST-NEXT: haddpd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: PR34724_add_v4f64_012u: diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index d8686b8b2950f..07ed430f919df 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -25,8 
+25,8 @@ define void @test_load_store(ptr %in, ptr %out) #0 { ; CHECK-I686-LABEL: test_load_store: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: pinsrw $0, (%ecx), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl @@ -297,9 +297,9 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT ; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rcx ; CHECK-LIBCALL-NEXT: movq %rcx, %rdx -; CHECK-LIBCALL-NEXT: sarq $63, %rdx ; CHECK-LIBCALL-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax +; CHECK-LIBCALL-NEXT: sarq $63, %rdx ; CHECK-LIBCALL-NEXT: andq %rdx, %rax ; CHECK-LIBCALL-NEXT: orq %rcx, %rax ; CHECK-LIBCALL-NEXT: popq %rcx @@ -311,9 +311,9 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx ; BWON-F16C-NEXT: movq %rcx, %rdx -; BWON-F16C-NEXT: sarq $63, %rdx ; BWON-F16C-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax +; BWON-F16C-NEXT: sarq $63, %rdx ; BWON-F16C-NEXT: andq %rdx, %rax ; BWON-F16C-NEXT: orq %rcx, %rax ; BWON-F16C-NEXT: retq @@ -610,10 +610,10 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: movw %cx, 6(%rbx) ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) @@ -654,10 +654,10 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; CHECK-I686-NEXT: movd %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncsfhf2 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movw %ax, (%esi) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: movw %cx, 6(%esi) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, 4(%esi) @@ -693,10 +693,10 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 { ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) ; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) +; CHECK-LIBCALL-NEXT: movw %cx, (%rbx) ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) @@ -763,10 +763,10 @@ define void 
@test_trunc64_vec4(<4 x double> %a, ptr %p) #0 { ; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movw %ax, 6(%esi) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movw %ax, 4(%esi) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: movw %cx, 4(%esi) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, 2(%esi) @@ -1615,20 +1615,20 @@ define <8 x half> @maxnum_v8f16(<8 x half> %0, <8 x half> %1) #0 { ; BWON-F16C-NEXT: # %bb.11: ; BWON-F16C-NEXT: vmovaps %xmm8, %xmm7 ; BWON-F16C-NEXT: .LBB26_12: +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm7 ; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm5, %xmm4 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm7, %xmm5 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm7 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm6 -; BWON-F16C-NEXT: vucomiss %xmm7, %xmm6 +; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm6 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm4 +; BWON-F16C-NEXT: vucomiss %xmm6, %xmm4 ; BWON-F16C-NEXT: ja .LBB26_14 ; BWON-F16C-NEXT: # %bb.13: -; BWON-F16C-NEXT: vmovaps %xmm7, %xmm6 +; BWON-F16C-NEXT: vmovaps %xmm6, %xmm4 ; BWON-F16C-NEXT: .LBB26_14: ; BWON-F16C-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm6, %xmm4 +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; BWON-F16C-NEXT: vpsrld $16, %xmm1, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; BWON-F16C-NEXT: vpsrld $16, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index 9121cf2d654a3..9c17bea45243d 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -633,19 +633,19 @@ define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: psrld %xmm3, %xmm5 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrld %xmm5, %xmm3 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psrld %xmm4, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm1, 
%xmm2 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] -; X86-SSE2-NEXT: andps %xmm5, %xmm0 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] +; X86-SSE2-NEXT: andps %xmm3, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -693,19 +693,19 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,u,1] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: psrld %xmm3, %xmm4 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: psrld %xmm3, %xmm5 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psrld %xmm5, %xmm3 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psrld %xmm4, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm1, %xmm2 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] -; X86-SSE2-NEXT: andps %xmm5, %xmm0 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] +; X86-SSE2-NEXT: andps %xmm3, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 1a2aac657d30f..383953fde9e3e 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -346,9 +346,9 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI1-NEXT: movl $1, %eax -; X86-BMI1-NEXT: xorl %esi, %esi ; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: shldl %cl, %eax, %edx +; X86-BMI1-NEXT: xorl %esi, %esi ; X86-BMI1-NEXT: shll %cl, %eax ; X86-BMI1-NEXT: testb $32, %cl ; X86-BMI1-NEXT: cmovnel %eax, %edx @@ -365,9 +365,9 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl $1, %edx -; X86-BMI2-NEXT: xorl %esi, %esi ; X86-BMI2-NEXT: xorl %eax, %eax ; X86-BMI2-NEXT: shldl %cl, %edx, %eax +; X86-BMI2-NEXT: xorl %esi, %esi ; X86-BMI2-NEXT: shlxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl ; X86-BMI2-NEXT: cmovnel %edx, %eax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 9946267b48e7f..c171bb7244c2f 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -123,7 +123,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -321,7 +321,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: notl %eax @@ -437,7 +437,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -938,7 +938,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: notl %eax @@ -1088,7 +1088,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -1779,7 +1779,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: notl %eax @@ -1950,7 +1950,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -2073,7 +2073,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: notl %eax @@ -2181,7 +2181,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: notl %eax @@ -2301,7 +2301,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; ; 
X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 @@ -2426,7 +2426,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index 0bbf94f1817f5..1e6366a73efe7 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -125,7 +125,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll index 54911351e68dc..241e4f49b0ebc 100644 --- a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll @@ -29,8 +29,8 @@ define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_hsubps_128: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vhsubps %xmm1, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vextractps $2, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -129,8 +129,8 @@ define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) noun define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_phaddw_128: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $0, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -149,8 +149,8 @@ define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) noun define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_phsubw_128: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $2, %xmm0, (%eax) ; X86-NEXT: retl ; @@ -173,8 +173,8 @@ define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) noun define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_haddps_256: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: vzeroupper @@ -221,8 +221,8 @@ define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> 
%a1, ptr%a2) define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_haddpd_256: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%eax) ; X86-NEXT: vzeroupper @@ -245,8 +245,8 @@ define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2 define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_hsubpd_256: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%eax) ; X86-NEXT: vzeroupper @@ -317,9 +317,9 @@ define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) noun define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_phaddw_256: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpbroadcastw %xmm1, %xmm0 ; X86-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $4, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -341,8 +341,8 @@ define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) no define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind { ; X86-LABEL: test_demanded_phsubw_256: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vphsubw %xmm1, %xmm0, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vpextrw $6, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index 443275e11459d..4fc9d525157e2 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -102,11 +102,11 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] @@ -125,11 +125,11 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; 
AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 @@ -169,16 +169,16 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1] ; SSSE3-SLOW-NEXT: movaps %xmm6, %xmm1 ; SSSE3-SLOW-NEXT: retq @@ -188,12 +188,12 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2 -; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6 -; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1 +; SSSE3-FAST-NEXT: haddps %xmm5, %xmm1 +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6 +; SSSE3-FAST-NEXT: haddps %xmm6, %xmm1 +; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: @@ -201,20 +201,20 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm4 ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,1] +; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[0] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3] +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX1-SLOW-NEXT: retq @@ -232,10 +232,10 @@ define <8 x float> 
@pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX1-FAST-NEXT: retq @@ -243,23 +243,23 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm0[1,3,1,3] ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] -; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[3,1] ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: retq ; @@ -270,17 +270,17 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] -; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] ; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: retq %9 = shufflevector <4 x float> %0, <4 x float> 
poison, <2 x i32> @@ -335,16 +335,16 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] ; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-SLOW-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSSE3-SLOW-NEXT: retq ; @@ -352,38 +352,38 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4 -; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2 -; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[0,2] ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 -; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,3,1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm5 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; 
AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX1-SLOW-NEXT: retq @@ -403,10 +403,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX1-FAST-NEXT: retq @@ -414,25 +414,25 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: retq ; @@ -451,11 +451,11 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: retq %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -551,9 
+551,10 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm4 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,1] ; SSSE3-FAST-NEXT: addps %xmm5, %xmm0 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 @@ -595,9 +596,9 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] +; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] ; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 @@ -686,7 +687,7 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] @@ -703,19 +704,19 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-FAST-NEXT: retq ; @@ -732,7 +733,7 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 @@ -749,19 +750,19 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -839,9 +840,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSSE3-FAST-NEXT: addss %xmm5, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: addss %xmm5, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5 @@ -902,23 +903,23 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm5 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] -; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] +; AVX-FAST-NEXT: vaddss %xmm4, %xmm5, %xmm4 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm5 ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-FAST-NEXT: vaddss %xmm1, %xmm5, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4 ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, 
%xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index 2849e448a0534..49106b20a1eb1 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -7,23 +7,23 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl $1, %esi +; X86-NEXT: addl $1, %edx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -48,23 +48,23 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl $1, %esi +; X86-NEXT: addl $1, %edx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..21319a346dfbc 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -14,28 +14,27 @@ define i64 @foo(i64 %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: mull %ecx ; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: 
movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: mull %ecx +; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: addl %ebp, %ebx -; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx ; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: mull %ecx +; X86-NOBMI-NEXT: mull %esi ; X86-NOBMI-NEXT: addl %ebx, %eax -; X86-NOBMI-NEXT: adcl %edx, %esi -; X86-NOBMI-NEXT: setb %al -; X86-NOBMI-NEXT: movzbl %al, %edi +; X86-NOBMI-NEXT: adcl %edx, %ecx +; X86-NOBMI-NEXT: setb %bl ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: mull %ecx -; X86-NOBMI-NEXT: addl %esi, %eax -; X86-NOBMI-NEXT: adcl %edi, %edx +; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: movzbl %bl, %esi +; X86-NOBMI-NEXT: addl %ecx, %eax +; X86-NOBMI-NEXT: adcl %esi, %edx ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: popl %ebx @@ -62,10 +61,10 @@ define i64 @foo(i64 %x, i64 %y) nounwind { ; X86-BMI-NEXT: mulxl %esi, %ecx, %ebx ; X86-BMI-NEXT: addl %edi, %ecx ; X86-BMI-NEXT: adcl %ebp, %ebx -; X86-BMI-NEXT: setb %cl -; X86-BMI-NEXT: movzbl %cl, %ecx ; X86-BMI-NEXT: movl %eax, %edx ; X86-BMI-NEXT: mulxl %esi, %eax, %edx +; X86-BMI-NEXT: setb %cl +; X86-BMI-NEXT: movzbl %cl, %ecx ; X86-BMI-NEXT: addl %ebx, %eax ; X86-BMI-NEXT: adcl %ecx, %edx ; X86-BMI-NEXT: popl %esi @@ -113,60 +112,59 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: xorl %ecx, %ecx +; X86-NOBMI-NEXT: xorl %edi, %edi ; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx -; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl (%ebx,%edi,8), %esi +; X86-NOBMI-NEXT: movl %esi, %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: mull %ecx +; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl 4(%ebx,%edi,8), %eax ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: mull %ecx ; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi +; X86-NOBMI-NEXT: movl %eax, %ecx +; X86-NOBMI-NEXT: addl %ebp, %ecx ; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx ; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ecx, %esi ; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: setb %cl ; X86-NOBMI-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) ; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NOBMI-NEXT: movzbl %cl, %ecx +; X86-NOBMI-NEXT: adcl %ecx, %edx +; X86-NOBMI-NEXT: movl %edi, %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NOBMI-NEXT: adcl $0, %eax ; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl %edi, (%ecx,%ebx,8) +; X86-NOBMI-NEXT: movl %ebx, %edi +; X86-NOBMI-NEXT: movl %esi, 4(%ecx,%ebx,8) +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: movl %edi, %ecx +; X86-NOBMI-NEXT: xorl %ebp, %ecx +; X86-NOBMI-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: orl %ecx, %esi ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -198,42 +196,40 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: movl (%esi,%ebx,8), %ecx ; X86-BMI-NEXT: movl %ecx, %edx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: mulxl %eax, %edx, %edi ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl 4(%esi,%ebx,8), %edx +; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-BMI-NEXT: mulxl %eax, %eax, %esi +; X86-BMI-NEXT: addl %edi, %eax ; X86-BMI-NEXT: movl %ecx, %edx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: addl %eax, %edi +; X86-BMI-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax 
+; X86-BMI-NEXT: adcl %esi, %ebp ; X86-BMI-NEXT: setb %dl ; X86-BMI-NEXT: addl %ebp, %ecx +; X86-BMI-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI-NEXT: movzbl %dl, %edx ; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X86-BMI-NEXT: adcl $0, %eax ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) +; X86-BMI-NEXT: movl %ecx, (%edx,%ebx,8) ; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI-NEXT: addl $1, %ebx @@ -243,7 +239,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: movl %ebp, %esi ; X86-BMI-NEXT: xorl %edi, %esi ; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax @@ -272,8 +268,8 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: adcq $0, %rdx ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 -; X64-NOBMI-NEXT: cmpq %r9, %rdi ; X64-NOBMI-NEXT: movq %rdx, %r10 +; X64-NOBMI-NEXT: cmpq %r9, %rdi ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -296,8 +292,8 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: adcq $0, %rdx ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 -; X64-BMI-NEXT: cmpq %r8, %rdi ; X64-BMI-NEXT: movq %rdx, %r9 +; X64-BMI-NEXT: cmpq %r8, %rdi ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll index 717f52f198ee8..09bdf7774f0d7 100644 --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -8,17 +8,16 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $30, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shrdl $2, %ecx, %edx @@ -30,7 +29,6 @@ define i128 @test1(i128 %x) nounwind { ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi 
; X86-NEXT: retl $4 ; ; X64-LABEL: test1: @@ -56,31 +54,31 @@ define i128 @test2(i128 %x) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shrl $30, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: shrdl $2, %edx, %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: sarl $2, %edx -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: adcl $0, %eax +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: sarl $2, %eax +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl $0, %ebx ; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -107,8 +105,327 @@ define i128 @test2(i128 %x) nounwind { define i128 @test3(i128 %x) nounwind { ; X86-LABEL: test3: -; X86 doesn't have __divti3, so the urem is expanded into a loop. 
-; X86: udiv-do-while +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %ebx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: subl %edi, %ecx +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB2_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %esi, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax +; X86-NEXT: jmp .LBB2_3 +; X86-NEXT: .LBB2_1: +; X86-NEXT: bsrl %ebx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: .LBB2_3: # %_udiv-special-cases +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB2_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %ecx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: orl $32, %edx +; X86-NEXT: jmp .LBB2_6 +; X86-NEXT: .LBB2_4: +; X86-NEXT: bsrl %edx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: .LBB2_6: # %_udiv-special-cases +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: jne .LBB2_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: .LBB2_8: # %_udiv-special-cases +; X86-NEXT: movl $61, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $127, %eax +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: setb %al +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload +; X86-NEXT: orb %al, %ah +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: testb %ah, %ah +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB2_10 +; X86-NEXT: # %bb.9: # %_udiv-special-cases +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .LBB2_10: # %_udiv-special-cases +; X86-NEXT: notl %edi +; X86-NEXT: jne .LBB2_11 +; X86-NEXT: # %bb.18: # %_udiv-special-cases +; X86-NEXT: movl %ecx, 
%eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: je .LBB2_19 +; X86-NEXT: # %bb.15: # %udiv-bb1 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrb $3, %dl +; X86-NEXT: andb $12, %dl +; X86-NEXT: negb %dl +; X86-NEXT: movsbl %dl, %edx +; X86-NEXT: movl 136(%esp,%edx), %esi +; X86-NEXT: movl 140(%esp,%edx), %eax +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 132(%esp,%edx), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%edx), %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $1, %ecx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jae .LBB2_12 +; X86-NEXT: # %bb.16: +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jmp .LBB2_17 +; X86-NEXT: .LBB2_11: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jmp .LBB2_19 +; X86-NEXT: .LBB2_12: # %udiv-preheader +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al 
+; X86-NEXT: movzbl %al, %edi +; X86-NEXT: movl 92(%esp,%edi), %edx +; X86-NEXT: movl 88(%esp,%edi), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shrdl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 84(%esp,%edi), %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %eax +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $3, %ecx +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $4, %ecx +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB2_13: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx 
# 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl $4, %edi +; X86-NEXT: andl %edi, %eax +; X86-NEXT: movl $3, %edi +; X86-NEXT: andl %edi, %ecx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: jne .LBB2_13 +; X86-NEXT: # %bb.14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .LBB2_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %eax, %esi +; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: .LBB2_19: # %udiv-end +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: xorl %edi, %ebx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 ; ; X64-LABEL: test3: ; X64: # %bb.0: @@ -118,6 +435,7 @@ define i128 @test3(i128 %x) nounwind { ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq +; X86 doesn't have __divti3, so the urem is expanded into a loop. 
%tmp = sdiv i128 %x, -73786976294838206467 ret i128 %tmp } diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll index 0a9da87642884..153c3cb5e0ba2 100644 --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -257,33 +257,33 @@ define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind { define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X86-SSE-LABEL: clamp_sitofp_2i64_2f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pxor %xmm1, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; X86-SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; X86-SSE-NEXT: pand %xmm4, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE-NEXT: por %xmm3, %xmm1 -; X86-SSE-NEXT: pand %xmm1, %xmm0 -; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: por %xmm0, %xmm1 -; X86-SSE-NEXT: pxor %xmm1, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE-NEXT: por %xmm3, %xmm2 +; X86-SSE-NEXT: pand %xmm2, %xmm0 +; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: por %xmm0, %xmm2 +; X86-SSE-NEXT: pxor %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: pcmpeqd %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,0,2147483903,0] -; X86-SSE-NEXT: pcmpgtd %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; X86-SSE-NEXT: pand %xmm3, %xmm2 +; X86-SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X86-SSE-NEXT: pand %xmm3, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-SSE-NEXT: por %xmm2, %xmm0 -; X86-SSE-NEXT: pand %xmm0, %xmm1 -; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: por %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm0, %xmm2 +; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X86-SSE-NEXT: retl @@ -324,8 +324,8 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X64-SSE-NEXT: pxor %xmm1, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm4 ; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm4 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] ; X64-SSE-NEXT: pand %xmm4, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] @@ -352,10 +352,12 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; ; X64-AVX-LABEL: clamp_sitofp_2i64_2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll index 5cb1b8f4a89e4..e7ed42a2e41c6 100644 --- a/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll @@ -12,8 +12,8 @@ define x86_thiscallcc void @stackRealignment(ptr %this) { ; SHRINK-WRAP-LABEL: stackRealignment: ; SHRINK-WRAP: # %bb.0: # %entry ; SHRINK-WRAP-NEXT: movl (%ecx), %eax -; SHRINK-WRAP-NEXT: cmpl $33, %eax ; SHRINK-WRAP-NEXT: movl $42, %edx +; SHRINK-WRAP-NEXT: cmpl $33, %eax ; SHRINK-WRAP-NEXT: jge LBB0_2 ; SHRINK-WRAP-NEXT: # %bb.1: # %entry ; SHRINK-WRAP-NEXT: movl $128, %edx @@ -43,8 +43,8 @@ define x86_thiscallcc void @stackRealignment(ptr %this) { ; NO-SHRINK-WRAP-NEXT: andl $-8, %esp ; NO-SHRINK-WRAP-NEXT: subl $16, %esp ; NO-SHRINK-WRAP-NEXT: movl (%ecx), %eax -; NO-SHRINK-WRAP-NEXT: cmpl $33, %eax ; NO-SHRINK-WRAP-NEXT: movl $42, %edx +; NO-SHRINK-WRAP-NEXT: cmpl $33, %eax ; NO-SHRINK-WRAP-NEXT: jge LBB0_2 ; NO-SHRINK-WRAP-NEXT: # %bb.1: # %entry ; NO-SHRINK-WRAP-NEXT: movl $128, %edx diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index 55c318e87a5a0..f277f44aa1ebb 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -126,7 +126,6 @@ define i128 @test_i128(i128 %a) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx @@ -140,6 +139,7 @@ define i128 @test_i128(i128 %a) nounwind { ; X86-NEXT: subl %edx, %ebx ; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 23dcf334124c0..f3b4abf9c21a3 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -120,7 +120,7 @@ define <4 x i1> @illegal_abs_to_eq_or(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: packssdw %xmm1, %xmm2 @@ -178,7 +178,7 @@ define <4 x i64> @illegal_abs_to_eq_or_sext(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -242,7 +242,7 @@ define <4 x i1> @illegal_abs_to_ne_and(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: packssdw 
%xmm1, %xmm2 @@ -278,8 +278,8 @@ define <4 x i1> @illegal_abs_to_ne_and(<4 x i64> %x) { define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) { ; AVX512-LABEL: illegal_abs_to_ne_and_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsq %ymm0, %ymm0 ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] +; AVX512-NEXT: vpabsq %ymm0, %ymm0 ; AVX512-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512-NEXT: retq @@ -306,7 +306,7 @@ define <4 x i64> @illegal_abs_to_ne_and_sext(<4 x i64> %x) { ; SSE41-NEXT: psubq %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [129,129] ; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm3, %xmm2 @@ -414,8 +414,8 @@ define <4 x i32> @legal_abs_eq_unchanged_sext(<4 x i32> %x) { define <4 x i1> @legal_abs_ne_unchangedd(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_ne_unchangedd: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] +; AVX512-NEXT: vpabsd %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq @@ -455,8 +455,8 @@ define <4 x i1> @legal_abs_ne_unchangedd(<4 x i32> %x) { define <4 x i32> @legal_abs_ne_unchangedd_sext(<4 x i32> %x) { ; AVX512-LABEL: legal_abs_ne_unchangedd_sext: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] +; AVX512-NEXT: vpabsd %xmm0, %xmm0 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq @@ -518,11 +518,11 @@ define <4 x i1> @eq_or_to_abs_vec4x64(<4 x i64> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 @@ -566,19 +566,19 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) { ; AVX2-LABEL: eq_or_to_abs_vec4x64_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x64_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 @@ -649,12 +649,12 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq 
{{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 @@ -712,12 +712,12 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x64_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = [129,129] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [129,129] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] ; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 @@ -944,7 +944,7 @@ define <4 x i1> @eq_or_to_abs_vec4x16(<4 x i16> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [88,88,88,88,88,88,88,88] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -1015,8 +1015,8 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) { ; AVX512-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1 -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 @@ -1085,18 +1085,18 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; AVX2-LABEL: ne_and_to_abs_vec4x16_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [88,88,88,88,88,88,88,88] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1105,8 +1105,8 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll index dada1726be424..3b0aa96e9a416 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ 
-9,9 +9,9 @@ define <4 x i1> @andnot_eq_v4i32_todo_no_splat(<4 x i32> %x) nounwind { ; AVX512-LABEL: andnot_eq_v4i32_todo_no_splat: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; AVX512-NEXT: retq ; @@ -69,9 +69,9 @@ define <2 x i1> @andnot_eq_v2i64_fail_max_not_n1(<2 x i64> %x) nounwind { ; AVX512-LABEL: andnot_eq_v2i64_fail_max_not_n1: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 -; AVX512-NEXT: korw %k1, %k0, %k1 +; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 +; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; AVX512-NEXT: korw %k0, %k1, %k1 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; AVX512-NEXT: retq ; @@ -151,15 +151,15 @@ define <8 x i1> @andnot_ne_v8i16_todo_no_splat(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $54, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 | xmm2) ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v8i16_todo_no_splat: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -167,8 +167,8 @@ define <8 x i1> @andnot_ne_v8i16_todo_no_splat(<8 x i16> %x) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq @@ -184,7 +184,7 @@ define <8 x i1> @andnot_ne_v8i16(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v8i16: @@ -217,15 +217,15 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $54, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 | xmm2) ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v16i8_fail_max_not_n1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; 
AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -233,8 +233,8 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqb %xmm1, %xmm2 ; SSE-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpeqb %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: retq @@ -250,7 +250,7 @@ define <16 x i1> @andnot_ne_v16i8(<16 x i8> %x) nounwind { ; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v16i8: @@ -309,37 +309,27 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $86, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm2 ^ (xmm0 | xmm1) ; AVX512-NEXT: retq ; ; AVX2-LABEL: addand_ne_v8i16_fail: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; SSE41-LABEL: addand_ne_v8i16_fail: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65533,65533,65533,65533,65533,65533,65533,65533] -; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; SSE2-LABEL: addand_ne_v8i16_fail: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65533,65533,65533,65533,65533,65533,65533,65533] -; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: addand_ne_v8i16_fail: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65533,65533,65533,65533,65533,65533,65533,65533] +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: retq %cmp1 = icmp ne <8 x i16> %x, %cmp2 = icmp ne <8 x i16> %x, %r = and <8 x i1> %cmp1, %cmp2 diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index c52b3ed6c926d..32ae9ec212550 100644 --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -75,12 +75,12 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $15, %edx, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -100,12 +100,12 @@ 
define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $15, %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; @@ -125,9 +125,9 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -148,9 +148,9 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -173,22 +173,22 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: shldl $17, %edx, %esi -; X86-NEXT: shldl $17, %ecx, %edx +; X86-NEXT: shldl $17, %esi, %edx +; X86-NEXT: shldl $17, %ecx, %esi ; X86-NEXT: shldl $17, %eax, %ecx ; X86-NEXT: shll $17, %eax ; X86-NEXT: movl %ecx, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: orl %edi, %ebx ; X86-NEXT: sete %bl -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: calll use@PLT @@ -225,8 +225,8 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll index 348f3a3be38ae..fdc3aed3443be 100644 --- a/llvm/test/CodeGen/X86/immediate_merging.ll +++ b/llvm/test/CodeGen/X86/immediate_merging.ll @@ -177,8 +177,8 @@ define dso_local void @foomemset() optsize { ; X64: # %bb.0: # %entry ; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 ; X64-NEXT: movq %rax, AA+16(%rip) -; X64-NEXT: movq %rax, AA+8(%rip) -; X64-NEXT: movq %rax, AA(%rip) +; X64-NEXT: movaps {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; X64-NEXT: movups %xmm0, AA(%rip) ; X64-NEXT: retq entry: call void @llvm.memset.p0.i32(ptr @AA, i8 33, i32 24, i1 false) @@ -204,8 +204,8 @@ define dso_local void @foomemset_pgso() !prof !14 { ; X64: # %bb.0: # %entry ; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 ; X64-NEXT: movq %rax, AA+16(%rip) -; X64-NEXT: movq %rax, AA+8(%rip) -; X64-NEXT: movq %rax, AA(%rip) +; 
X64-NEXT: movaps {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; X64-NEXT: movups %xmm0, AA(%rip) ; X64-NEXT: retq entry: call void @llvm.memset.p0.i32(ptr @AA, i8 33, i32 24, i1 false) diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll index de63c9ae209df..12efd9acf125e 100644 --- a/llvm/test/CodeGen/X86/implicit-null-check.ll +++ b/llvm/test/CodeGen/X86/implicit-null-check.ll @@ -137,8 +137,8 @@ define i256 @imp_null_check_load_i256(ptr %x) { ; CHECK-NEXT: retq ; CHECK-NEXT: LBB5_1: ## %is_null ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movaps %xmm0, 16(%rax) -; CHECK-NEXT: movq $0, 8(%rax) +; CHECK-NEXT: movups %xmm0, 8(%rax) +; CHECK-NEXT: movq $0, 24(%rax) ; CHECK-NEXT: movq $42, (%rax) ; CHECK-NEXT: retq @@ -408,7 +408,8 @@ define i32 @imp_null_check_gep_load_with_use_dep(ptr %x, i32 %a) { ; CHECK-NEXT: movl (%rdi), %eax ## on-fault: LBB15_1 ; CHECK-NEXT: ## %bb.2: ## %not_null ; CHECK-NEXT: addl %edi, %esi -; CHECK-NEXT: leal 4(%rax,%rsi), %eax +; CHECK-NEXT: addl %esi, %eax +; CHECK-NEXT: addl $4, %eax ; CHECK-NEXT: retq ; CHECK-NEXT: LBB15_1: ## %is_null ; CHECK-NEXT: movl $42, %eax diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll index 9131688c4efcc..91c9ffc58d657 100644 --- a/llvm/test/CodeGen/X86/imul.ll +++ b/llvm/test/CodeGen/X86/imul.ll @@ -137,10 +137,10 @@ define i64 @mul3_64(i64 %A) { ; ; X86-LABEL: mul3_64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: movl $3, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl %mul = mul i64 %A, 3 @@ -174,10 +174,10 @@ define i64 @mul40_64(i64 %A) { ; ; X86-LABEL: mul40_64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: movl $40, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: leal (%edx,%ecx,8), %edx ; X86-NEXT: retl %mul = mul i64 %A, 40 @@ -394,13 +394,18 @@ define i64 @test4(i64 %a) { ; ; X86-LABEL: test4: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $31, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $5, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, 31 @@ -418,20 +423,25 @@ define i64 @test5(i64 %a) { ; ; X86-LABEL: test5: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: shll $5, %eax -; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl $-31, %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edx +; X86-NEXT: shll $5, %edi +; X86-NEXT: subl %edi, %esi ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl 
%esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %edi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: @@ -450,13 +460,18 @@ define i64 @test6(i64 %a) { ; ; X86-LABEL: test6: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $33, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $5, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, 33 @@ -474,20 +489,25 @@ define i64 @test7(i64 %a) { ; ; X86-LABEL: test7: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shll $5, %esi -; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: movl $-33, %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edx +; X86-NEXT: shll $5, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: subl %esi, %edx ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %edi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index c44945ac2d929..37e56abd484b1 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -53,39 +53,27 @@ define <16 x i8> @elt0_v16i8(i8 %x) { } define <8 x i16> @elt5_v8i16(i16 %x) { -; X86-SSE2-LABEL: elt5_v8i16: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] -; X86-SSE2-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: elt5_v8i16: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] -; X64-SSE2-NEXT: pinsrw $5, %edi, %xmm0 -; X64-SSE2-NEXT: retq -; -; X86-SSE4-LABEL: elt5_v8i16: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] -; X86-SSE4-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 -; X86-SSE4-NEXT: retl +; X86-SSE-LABEL: elt5_v8i16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X86-SSE-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE4-LABEL: elt5_v8i16: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] -; X64-SSE4-NEXT: pinsrw $5, %edi, %xmm0 -; X64-SSE4-NEXT: retq +; X64-SSE-LABEL: elt5_v8i16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] +; X64-SSE-NEXT: pinsrw $5, %edi, %xmm0 +; X64-SSE-NEXT: retq ; ; X86-AVX-LABEL: elt5_v8i16: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X86-AVX-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt5_v8i16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbw {{.*#+}} xmm0 = [42,1,2,3,4,0,6,7] +; 
X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X64-AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <8 x i16> , i16 %x, i32 5 @@ -111,25 +99,25 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; ; X86-SSE4-LABEL: elt3_v4i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbd {{.*#+}} xmm0 = [42,1,2,0] +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] ; X86-SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt3_v4i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbd {{.*#+}} xmm0 = [42,1,2,0] +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm0 ; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt3_v4i32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [42,1,2,0] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] ; X86-AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt3_v4i32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [42,1,2,0] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] ; X64-AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <4 x i32> , i32 %x, i32 3 @@ -152,7 +140,7 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; ; X64-SSE4-LABEL: elt0_v2i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,1] +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [u,1] ; X64-SSE4-NEXT: pinsrq $0, %rdi, %xmm0 ; X64-SSE4-NEXT: retq ; @@ -162,11 +150,24 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; -; X64-AVX-LABEL: elt0_v2i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,1] -; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: elt0_v2i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; X64-AVX1-NEXT: # xmm0 = mem[0,0] +; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: elt0_v2i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,1] +; X64-AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: elt0_v2i64: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,1] +; X64-AVX512F-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX512F-NEXT: retq %ins = insertelement <2 x i64> , i64 %x, i32 0 ret <2 x i64> %ins } @@ -267,14 +268,14 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; ; X86-SSE4-LABEL: elt7_v8i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [4,5,6,0] +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] ; X86-SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm1 ; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt7_v8i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [4,5,6,0] +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm1 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64-SSE4-NEXT: retq @@ -393,7 +394,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-SSE4-LABEL: elt5_v8i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [4,4] +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4,u] ; X64-SSE4-NEXT: pinsrq $1, %rdi, %xmm2 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1] ; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = [2,3] @@ -428,7 +429,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-AVX2-LABEL: elt5_v8i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,0,6,7] +; 
X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4,u,6,7] ; X64-AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] @@ -436,9 +437,9 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X86-AVX512F-LABEL: elt5_v8i64: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm0 = [42,1,2,3] +; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,0] +; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0] ; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 ; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -447,7 +448,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X64-AVX512F-LABEL: elt5_v8i64: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovq %rdi, %xmm1 -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,10,11,12,0,14,15] +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,9,10,11,12,0,14,15] ; X64-AVX512F-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x i64> , i64 %x, i32 5 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll index 435ea61412b73..d4f0e6b7cb796 100644 --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -10,21 +10,21 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli ; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; SSE-32-NEXT: xorps %xmm0, %xmm0 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] ; SSE-32-NEXT: xorps %xmm1, %xmm1 -; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] -; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] -; SSE-32-NEXT: movups %xmm0, 624(%eax) -; SSE-32-NEXT: movups %xmm1, 608(%eax) +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE-32-NEXT: movups %xmm1, 624(%eax) +; SSE-32-NEXT: movups %xmm0, 608(%eax) ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: PR15298: ; SSE-64: # %bb.0: # %L.entry ; SSE-64-NEXT: xorps %xmm0, %xmm0 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] ; SSE-64-NEXT: xorps %xmm1, %xmm1 -; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] -; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] -; SSE-64-NEXT: movups %xmm0, 624(%rsi) -; SSE-64-NEXT: movups %xmm1, 608(%rsi) +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE-64-NEXT: movups %xmm1, 624(%rsi) +; SSE-64-NEXT: movups %xmm0, 608(%rsi) ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: PR15298: diff --git a/llvm/test/CodeGen/X86/insertelement-legalize.ll b/llvm/test/CodeGen/X86/insertelement-legalize.ll index 67f824ff8412d..6da9c2052e19c 100644 --- a/llvm/test/CodeGen/X86/insertelement-legalize.ll +++ b/llvm/test/CodeGen/X86/insertelement-legalize.ll @@ -10,16 +10,16 @@ define void @test(<2 x i64> %val, ptr %dst, i64 %x) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl %ecx, %ecx -; CHECK-NEXT: adcl %edx, %edx +; CHECK-NEXT: adcl %edi, %edi ; 
CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: movl %esi, (%eax) -; CHECK-NEXT: movl %edx, 12(%eax) -; CHECK-NEXT: movl %edi, 4(%eax) +; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: movl %edi, 12(%eax) +; CHECK-NEXT: movl %esi, 4(%eax) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll index e2defdc370543..7346de54182b7 100644 --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -31,9 +31,9 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind ; X86_AVX256-LABEL: insert_subvector_512: ; X86_AVX256: # %bb.0: ; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 +; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm3 ; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; X86_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: @@ -47,7 +47,7 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind ; X86_AVX512-LABEL: insert_subvector_512: ; X86_AVX512: # %bb.0: ; X86_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X86_AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] +; X86_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0] ; X86_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X86_AVX512-NEXT: retl ; @@ -55,7 +55,7 @@ define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind ; X64_AVX512: # %bb.0: ; X64_AVX512-NEXT: vmovd %edi, %xmm1 ; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 -; X64_AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] +; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7] ; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X64_AVX512-NEXT: retq %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0 diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll index d151c6f28e51b..0dc2ea27c87e8 100644 --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -1054,19 +1054,19 @@ define <4 x float> @arg_f32_v4f32(<4 x float> %v, float %x, i32 %y) nounwind { ; ; AVX1-LABEL: arg_f32_v4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: arg_f32_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1111,20 +1111,20 @@ define <2 x double> @arg_f64_v2f64(<2 x double> %v, double %x, i32 %y) nounwind ; ; AVX1-LABEL: arg_f64_v2f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = 
xmm1[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: arg_f64_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1407,20 +1407,20 @@ define <4 x float> @load_f32_v4f32(<4 x float> %v, ptr %p, i32 %y) nounwind { ; ; AVX1-LABEL: load_f32_v4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss (%rdi), %xmm1 -; AVX1-NEXT: vmovd %esi, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %esi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss (%rdi), %xmm2 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_f32_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss (%rdi), %xmm1 -; AVX2-NEXT: vmovd %esi, %xmm2 -; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %esi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vbroadcastss (%rdi), %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_f32_v4f32: @@ -1467,21 +1467,21 @@ define <2 x double> @load_f64_v2f64(<2 x double> %v, ptr %p, i32 %y) nounwind { ; ; AVX1-LABEL: load_f64_v2f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-NEXT: vmovd %esi, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %esi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_f64_v2f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_f64_v2f64: @@ -1792,10 +1792,10 @@ define <8 x float> @arg_f32_v8f32(<8 x float> %v, float %x, i32 %y) nounwind { ; ; AVX2-LABEL: arg_f32_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1844,11 +1844,11 @@ define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind ; ; AVX2-LABEL: arg_f64_v4f64: ; AVX2: # %bb.0: 
-; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2169,9 +2169,9 @@ define <8 x float> @load_f32_v8f32(<8 x float> %v, ptr %p, i32 %y) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vbroadcastss (%rdi), %ymm2 -; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm1, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_f32_v8f32: @@ -2223,9 +2223,9 @@ define <4 x double> @load_f64_v4f64(<4 x double> %v, ptr %p, i32 %y) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_f64_v4f64: @@ -2272,8 +2272,8 @@ define <4 x double> @load_f64_v4f64(<4 x double> %v, ptr %p, i32 %y) nounwind { define i32 @PR44139(ptr %p) { ; SSE-LABEL: PR44139: ; SSE: # %bb.0: -; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] +; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 112(%rdi) ; SSE-NEXT: movdqa %xmm0, 64(%rdi) @@ -2281,10 +2281,10 @@ define i32 @PR44139(ptr %p) { ; SSE-NEXT: movdqa %xmm0, 32(%rdi) ; SSE-NEXT: movdqa %xmm0, 48(%rdi) ; SSE-NEXT: movdqa %xmm0, (%rdi) -; SSE-NEXT: movdqa %xmm0, 16(%rdi) ; SSE-NEXT: leal 2147483647(%rax), %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: movdqa %xmm0, 16(%rdi) ; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: # kill: def $eax killed $eax killed $rax @@ -2301,10 +2301,10 @@ define i32 @PR44139(ptr %p) { ; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm1, (%rdi) ; AVX1-NEXT: leal 2147483647(%rax), %ecx ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: cmovnsl %eax, %ecx +; AVX1-NEXT: vmovaps %ymm1, (%rdi) ; AVX1-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 ; AVX1-NEXT: addl %eax, %ecx ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax @@ -2322,10 +2322,10 @@ define i32 @PR44139(ptr %p) { ; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) -; AVX2-NEXT: vmovdqa %ymm1, (%rdi) ; AVX2-NEXT: leal 2147483647(%rax), %ecx ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: cmovnsl %eax, %ecx +; AVX2-NEXT: vmovdqa %ymm1, (%rdi) ; AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 ; AVX2-NEXT: addl %eax, %ecx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax @@ -2341,10 +2341,10 @@ define i32 @PR44139(ptr %p) { ; AVX512-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1 ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdi) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: leal 2147483647(%rax), %ecx ; 
AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: cmovnsl %eax, %ecx +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) ; AVX512-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 ; AVX512-NEXT: addl %eax, %ecx ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax @@ -2355,6 +2355,9 @@ define i32 @PR44139(ptr %p) { ; ; X86AVX2-LABEL: PR44139: ; X86AVX2: # %bb.0: +; X86AVX2-NEXT: pushl %esi +; X86AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86AVX2-NEXT: .cfi_offset %esi, -8 ; X86AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 ; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] @@ -2363,14 +2366,16 @@ define i32 @PR44139(ptr %p) { ; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx) ; X86AVX2-NEXT: movl (%ecx), %eax -; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) -; X86AVX2-NEXT: leal 2147483647(%eax), %ecx +; X86AVX2-NEXT: leal 2147483647(%eax), %esi ; X86AVX2-NEXT: testl %eax, %eax -; X86AVX2-NEXT: cmovnsl %eax, %ecx -; X86AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 -; X86AVX2-NEXT: addl %eax, %ecx +; X86AVX2-NEXT: cmovnsl %eax, %esi +; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) +; X86AVX2-NEXT: andl $-2147483648, %esi # imm = 0x80000000 +; X86AVX2-NEXT: addl %eax, %esi ; X86AVX2-NEXT: xorl %edx, %edx -; X86AVX2-NEXT: divl %ecx +; X86AVX2-NEXT: divl %esi +; X86AVX2-NEXT: popl %esi +; X86AVX2-NEXT: .cfi_def_cfa_offset 4 ; X86AVX2-NEXT: vzeroupper ; X86AVX2-NEXT: retl %L = load <16 x i64>, ptr %p diff --git a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll index 1eb43db350447..09707f4c4b476 100644 --- a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll @@ -1,15 +1,7 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s define i8 @ctz_v8i16(<8 x i16> %a) { -; CHECK-LABEL: .LCPI0_0: -; CHECK-NEXT: .byte 8 -; CHECK-NEXT: .byte 7 -; CHECK-NEXT: .byte 6 -; CHECK-NEXT: .byte 5 -; CHECK-NEXT: .byte 4 -; CHECK-NEXT: .byte 3 -; CHECK-NEXT: .byte 2 -; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: pxor %xmm1, %xmm1 @@ -19,17 +11,17 @@ define i8 @ctz_v8i16(<8 x i16> %a) { ; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: cmpb %cl, %al ; CHECK-NEXT: cmoval %eax, %ecx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cmpb %al, %cl ; CHECK-NEXT: cmovbel %eax, %ecx -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: cmpb %al, %cl -; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: cmpb %dl, %cl ; CHECK-NEXT: cmovbel %edx, %ecx +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cmpb %al, %cl ; CHECK-NEXT: cmovbel %eax, %ecx @@ -47,11 +39,6 @@ define i8 @ctz_v8i16(<8 x i16> %a) { } define i16 @ctz_v4i32(<4 x i32> %a) { -; CHECK-LABEL: .LCPI1_0: -; CHECK-NEXT: .byte 4 -; CHECK-NEXT: .byte 3 -; CHECK-NEXT: .byte 2 -; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: pxor %xmm1, %xmm1 @@ -85,15 +72,6 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; ZERO IS POISON define i8 @ctz_v8i16_poison(<8 x i16> %a) { -; CHECK-LABEL: .LCPI2_0: -; CHECK-NEXT: .byte 8 -; CHECK-NEXT: .byte 7 -; CHECK-NEXT: .byte 6 -; CHECK-NEXT: 
.byte 5 -; CHECK-NEXT: .byte 4 -; CHECK-NEXT: .byte 3 -; CHECK-NEXT: .byte 2 -; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v8i16_poison: ; CHECK: # %bb.0: ; CHECK-NEXT: pxor %xmm1, %xmm1 @@ -103,17 +81,17 @@ define i8 @ctz_v8i16_poison(<8 x i16> %a) { ; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: cmpb %cl, %al ; CHECK-NEXT: cmoval %eax, %ecx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cmpb %al, %cl ; CHECK-NEXT: cmovbel %eax, %ecx -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: cmpb %al, %cl -; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: cmpb %dl, %cl ; CHECK-NEXT: cmovbel %edx, %ecx +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx ; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cmpb %al, %cl ; CHECK-NEXT: cmovbel %eax, %ecx diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll index 52d294ca01720..2631b04a04802 100644 --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -76,11 +76,11 @@ define i1 @is_snan_f80(x86_fp80 %x) nounwind { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $32767, %eax # imm = 0x7FFF ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 ; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl $32767, %esi # imm = 0x7FFF @@ -238,9 +238,9 @@ define i1 @is_inf_f80(x86_fp80 %x) nounwind { ; X86-LABEL: is_inf_f80: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: notl %eax ; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: notl %eax ; X86-NEXT: andl $32767, %eax # imm = 0x7FFF ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax @@ -265,12 +265,12 @@ entry: define i1 @is_posinf_f80(x86_fp80 %x) nounwind { ; X86-LABEL: is_posinf_f80: ; X86: # %bb.0: # %entry -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl $32767, %eax # imm = 0x7FFF -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -303,9 +303,9 @@ define i1 @is_neginf_f80(x86_fp80 %x) nounwind { ; X64-LABEL: is_neginf_f80: ; X64: # %bb.0: # %entry ; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; X64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; X64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete %al ; X64-NEXT: retq @@ -351,16 +351,15 @@ entry: define i1 @is_posnormal_f80(x86_fp80 %x) nounwind { ; X86-LABEL: is_posnormal_f80: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx 
-; X86-NEXT: andl $32767, %edx # imm = 0x7FFF -; X86-NEXT: decl %edx -; X86-NEXT: movzwl %dx, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: cmpl $32766, %edx # imm = 0x7FFE -; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32767, %eax # imm = 0x7FFF +; X86-NEXT: decl %eax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl $32766, %eax # imm = 0x7FFE +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setb %dl ; X86-NEXT: testl $32768, %ecx # imm = 0x8000 ; X86-NEXT: sete %cl @@ -368,7 +367,6 @@ define i1 @is_posnormal_f80(x86_fp80 %x) nounwind { ; X86-NEXT: andb %cl, %al ; X86-NEXT: andb %dl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: is_posnormal_f80: @@ -395,16 +393,15 @@ entry: define i1 @is_negnormal_f80(x86_fp80 %x) nounwind { ; X86-LABEL: is_negnormal_f80: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl $32767, %edx # imm = 0x7FFF -; X86-NEXT: decl %edx -; X86-NEXT: movzwl %dx, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: cmpl $32766, %edx # imm = 0x7FFE -; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32767, %eax # imm = 0x7FFF +; X86-NEXT: decl %eax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl $32766, %eax # imm = 0x7FFE +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setb %dl ; X86-NEXT: testl $32768, %ecx # imm = 0x8000 ; X86-NEXT: setne %cl @@ -412,7 +409,6 @@ define i1 @is_negnormal_f80(x86_fp80 %x) nounwind { ; X86-NEXT: andb %cl, %al ; X86-NEXT: andb %dl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: is_negnormal_f80: diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index 97136dafa6c2c..301831b0e6d3a 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -1591,35 +1591,30 @@ entry: define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp { ; X86-LABEL: isnan_v4f_strictfp: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: shlb $2, %al ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: andl %ecx, %edx ; X86-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001 -; X86-NEXT: setge %dh -; X86-NEXT: shlb $2, %dh -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: andl %ecx, %esi -; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 ; X86-NEXT: setge %dl ; X86-NEXT: shlb $3, %dl -; X86-NEXT: orb %dh, %dl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: andl %ecx, %esi -; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 +; X86-NEXT: orb %al, %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 ; X86-NEXT: setge %dh ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 ; X86-NEXT: setge %cl ; X86-NEXT: addb %cl, %cl ; X86-NEXT: orb %dh, %cl ; X86-NEXT: orb %dl, %cl ; X86-NEXT: movb %cl, (%eax) 
-; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: isnan_v4f_strictfp: diff --git a/llvm/test/CodeGen/X86/isel-and.ll b/llvm/test/CodeGen/X86/isel-and.ll index 3fda0e1d86391..094d8c3a9c13a 100644 --- a/llvm/test/CodeGen/X86/isel-and.ll +++ b/llvm/test/CodeGen/X86/isel-and.ll @@ -529,9 +529,9 @@ define i64 @and_imm32_i64(i64 %a) { define i64 @and_imm64_i64(i64 %a) { ; SDAG-X86-LABEL: and_imm64_i64: ; SDAG-X86: # %bb.0: -; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: movl $-1850691612, %eax # imm = 0x91B0AFE4 ; SDAG-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: andl $-2, %edx ; SDAG-X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/isel-buildvector-avx.ll b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll index a9297f016521d..91abfff2a3424 100644 --- a/llvm/test/CodeGen/X86/isel-buildvector-avx.ll +++ b/llvm/test/CodeGen/X86/isel-buildvector-avx.ll @@ -43,15 +43,10 @@ define <8 x float> @test_vector_v8f32() { } define <4 x i64> @test_vector_v4i64() { -; AVX-LABEL: test_vector_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23430,24650,1,12] -; AVX-NEXT: retq -; -; AVX512-LABEL: test_vector_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm0 = [23430,24650,1,12] -; AVX512-NEXT: retq +; AVX-ALL-LABEL: test_vector_v4i64: +; AVX-ALL: # %bb.0: +; AVX-ALL-NEXT: vmovaps {{.*#+}} ymm0 = [23430,24650,1,12] +; AVX-ALL-NEXT: retq ret <4 x i64> } diff --git a/llvm/test/CodeGen/X86/isel-fp-to-int.ll b/llvm/test/CodeGen/X86/isel-fp-to-int.ll index fae3db6ad0afa..fe2018263eb15 100644 --- a/llvm/test/CodeGen/X86/isel-fp-to-int.ll +++ b/llvm/test/CodeGen/X86/isel-fp-to-int.ll @@ -9,9 +9,9 @@ define i64 @test_double_to_ui64(double %x) { ; SDAG-X64: # %bb.0: # %entry ; SDAG-X64-NEXT: cvttsd2si %xmm0, %rcx ; SDAG-X64-NEXT: movq %rcx, %rdx -; SDAG-X64-NEXT: sarq $63, %rdx ; SDAG-X64-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SDAG-X64-NEXT: cvttsd2si %xmm0, %rax +; SDAG-X64-NEXT: sarq $63, %rdx ; SDAG-X64-NEXT: andq %rdx, %rax ; SDAG-X64-NEXT: orq %rcx, %rax ; SDAG-X64-NEXT: retq @@ -108,9 +108,9 @@ define i64 @test_float_to_ui64(float %x) { ; SDAG-X64: # %bb.0: # %entry ; SDAG-X64-NEXT: cvttss2si %xmm0, %rcx ; SDAG-X64-NEXT: movq %rcx, %rdx -; SDAG-X64-NEXT: sarq $63, %rdx ; SDAG-X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SDAG-X64-NEXT: cvttss2si %xmm0, %rax +; SDAG-X64-NEXT: sarq $63, %rdx ; SDAG-X64-NEXT: andq %rdx, %rax ; SDAG-X64-NEXT: orq %rcx, %rax ; SDAG-X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/isel-icmp.ll b/llvm/test/CodeGen/X86/isel-icmp.ll index 8a4d035086112..00bba7902774a 100644 --- a/llvm/test/CodeGen/X86/isel-icmp.ll +++ b/llvm/test/CodeGen/X86/isel-icmp.ll @@ -155,10 +155,10 @@ define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { ; FAST-X86-LABEL: test_icmp_eq_i64: ; FAST-X86: ## %bb.0: ; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FAST-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax ; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FAST-X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; FAST-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; FAST-X86-NEXT: orl %ecx, %eax +; FAST-X86-NEXT: orl %eax, %ecx ; FAST-X86-NEXT: sete %al ; FAST-X86-NEXT: andb $1, %al ; FAST-X86-NEXT: movzbl %al, %eax diff --git a/llvm/test/CodeGen/X86/isel-or.ll b/llvm/test/CodeGen/X86/isel-or.ll index 449f29a027743..0b02ba217e161 100644 --- a/llvm/test/CodeGen/X86/isel-or.ll +++ b/llvm/test/CodeGen/X86/isel-or.ll @@ -518,9 +518,9 @@ define i64 @or_imm32_i64(i64 %a) { define i64 
@or_imm64_i64(i64 %a) { ; SDAG-X86-LABEL: or_imm64_i64: ; SDAG-X86: # %bb.0: -; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: movl $-1850691612, %eax # imm = 0x91B0AFE4 ; SDAG-X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: orl $-2, %edx ; SDAG-X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/isel-phi.ll b/llvm/test/CodeGen/X86/isel-phi.ll index ee2039492abfd..aa25b969d9f88 100644 --- a/llvm/test/CodeGen/X86/isel-phi.ll +++ b/llvm/test/CodeGen/X86/isel-phi.ll @@ -336,8 +336,8 @@ define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 ; DAG-X64-NEXT: testb $1, %r10b ; DAG-X64-NEXT: jne .LBB7_9 ; DAG-X64-NEXT: # %bb.2: # %cond.true.false -; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: movq %rsi, %rax +; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: jne .LBB7_9 ; DAG-X64-NEXT: # %bb.3: # %cond.true.false.false ; DAG-X64-NEXT: movq %rdx, %rax @@ -346,15 +346,15 @@ define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 ; DAG-X64-NEXT: testb $1, %r10b ; DAG-X64-NEXT: je .LBB7_7 ; DAG-X64-NEXT: # %bb.5: # %cond.false.true -; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: movq %rcx, %rax +; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: jne .LBB7_9 ; DAG-X64-NEXT: # %bb.6: # %cond.false.true.false ; DAG-X64-NEXT: movq %r8, %rax ; DAG-X64-NEXT: retq ; DAG-X64-NEXT: .LBB7_7: # %cond.false.false -; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: movq %r9, %rax +; DAG-X64-NEXT: testb $1, %dil ; DAG-X64-NEXT: jne .LBB7_9 ; DAG-X64-NEXT: # %bb.8: # %cond.false.false.false ; DAG-X64-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -373,8 +373,8 @@ define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 ; GLOBAL-X64-NEXT: testb $1, %r10b ; GLOBAL-X64-NEXT: jne .LBB7_9 ; GLOBAL-X64-NEXT: # %bb.2: # %cond.true.false -; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: movq %rsi, %rax +; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: jne .LBB7_9 ; GLOBAL-X64-NEXT: # %bb.3: # %cond.true.false.false ; GLOBAL-X64-NEXT: movq %rdx, %rax @@ -383,15 +383,15 @@ define ptr @test_ptr(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, i1 ; GLOBAL-X64-NEXT: testb $1, %r10b ; GLOBAL-X64-NEXT: je .LBB7_7 ; GLOBAL-X64-NEXT: # %bb.5: # %cond.false.true -; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: movq %rcx, %rax +; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: jne .LBB7_9 ; GLOBAL-X64-NEXT: # %bb.6: # %cond.false.true.false ; GLOBAL-X64-NEXT: movq %r8, %rax ; GLOBAL-X64-NEXT: retq ; GLOBAL-X64-NEXT: .LBB7_7: # %cond.false.false -; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: movq %r9, %rax +; GLOBAL-X64-NEXT: testb $1, %dil ; GLOBAL-X64-NEXT: jne .LBB7_9 ; GLOBAL-X64-NEXT: # %bb.8: # %cond.false.false.false ; GLOBAL-X64-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/isel-sdiv.ll b/llvm/test/CodeGen/X86/isel-sdiv.ll index 1aca8d1035664..0b7f3f0bac5d3 100644 --- a/llvm/test/CodeGen/X86/isel-sdiv.ll +++ b/llvm/test/CodeGen/X86/isel-sdiv.ll @@ -49,9 +49,9 @@ define i16 @test_sdiv_i16(i16 %arg1, i16 %arg2) nounwind { ; GISEL-X86-LABEL: test_sdiv_i16: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; GISEL-X86-NEXT: cwtd +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-NEXT: idivw %cx ; GISEL-X86-NEXT: retl %ret = sdiv i16 %arg1, %arg2 @@ -77,13 +77,6 @@ define i32 @test_sdiv_i32(i32 %arg1, i32 %arg2) nounwind { } define 
i64 @test_sdiv_i64(i64 %arg1, i64 %arg2) nounwind { -; X64-LABEL: test_sdiv_i64: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: cqto -; X64-NEXT: idivq %rsi -; X64-NEXT: retq -; ; DAG-X86-LABEL: test_sdiv_i64: ; DAG-X86: # %bb.0: ; DAG-X86-NEXT: subl $12, %esp diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll index d013ad2c7fbff..5e1db59b49de6 100644 --- a/llvm/test/CodeGen/X86/isel-select-cmov.ll +++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll @@ -190,8 +190,8 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex ; ; FAST-X86-CMOV-LABEL: select_cmov_i16: ; FAST-X86-CMOV: ## %bb.0: -; FAST-X86-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; FAST-X86-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; FAST-X86-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; FAST-X86-CMOV-NEXT: cmovew {{[0-9]+}}(%esp), %ax ; FAST-X86-CMOV-NEXT: movzwl %ax, %eax ; FAST-X86-CMOV-NEXT: retl @@ -586,10 +586,10 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) { ; ; FAST-X86-CMOV-LABEL: select_cmov_i64: ; FAST-X86-CMOV: ## %bb.0: -; FAST-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; FAST-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; FAST-X86-CMOV-NEXT: cmpb $0, {{[0-9]+}}(%esp) +; FAST-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; FAST-X86-CMOV-NEXT: cmovel {{[0-9]+}}(%esp), %eax +; FAST-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; FAST-X86-CMOV-NEXT: cmovel {{[0-9]+}}(%esp), %edx ; FAST-X86-CMOV-NEXT: retl ; @@ -617,9 +617,9 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) { ; GISEL-X86-CMOV: ## %bb.0: ; GISEL-X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; GISEL-X86-CMOV-NEXT: testl %ecx, %ecx ; GISEL-X86-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; GISEL-X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx ; GISEL-X86-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %edx ; GISEL-X86-CMOV-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/isel-srem.ll b/llvm/test/CodeGen/X86/isel-srem.ll index 1dabf4175c852..35e7066ec42b5 100644 --- a/llvm/test/CodeGen/X86/isel-srem.ll +++ b/llvm/test/CodeGen/X86/isel-srem.ll @@ -79,9 +79,9 @@ define i16 @test_srem_i16(i16 %arg1, i16 %arg2) nounwind { ; GISEL-X86-LABEL: test_srem_i16: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-NEXT: # kill: def $ax killed $ax killed $eax ; GISEL-X86-NEXT: cwtd +; GISEL-X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-NEXT: idivw %cx ; GISEL-X86-NEXT: movl %edx, %eax ; GISEL-X86-NEXT: retl @@ -110,13 +110,67 @@ define i32 @test_srem_i32(i32 %arg1, i32 %arg2) nounwind { } define i64 @test_srem_i64(i64 %arg1, i64 %arg2) nounwind { -; X64-LABEL: test_srem_i64: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: cqto -; X64-NEXT: idivq %rsi -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: retq +; SDAG-X64-LABEL: test_srem_i64: +; SDAG-X64: # %bb.0: +; SDAG-X64-NEXT: movq %rdi, %rax +; SDAG-X64-NEXT: movq %rdi, %rcx +; SDAG-X64-NEXT: orq %rsi, %rcx +; SDAG-X64-NEXT: shrq $32, %rcx +; SDAG-X64-NEXT: je .LBB3_1 +; SDAG-X64-NEXT: # %bb.2: +; SDAG-X64-NEXT: cqto +; SDAG-X64-NEXT: idivq %rsi +; SDAG-X64-NEXT: movq %rdx, %rax +; SDAG-X64-NEXT: retq +; SDAG-X64-NEXT: .LBB3_1: +; SDAG-X64-NEXT: # kill: def $eax killed $eax killed $rax +; SDAG-X64-NEXT: xorl %edx, %edx +; SDAG-X64-NEXT: divl %esi +; SDAG-X64-NEXT: movl %edx, %eax +; SDAG-X64-NEXT: retq +; +; FAST-X64-LABEL: test_srem_i64: +; 
FAST-X64: # %bb.0: +; FAST-X64-NEXT: movq %rdi, %rax +; FAST-X64-NEXT: movq %rdi, %rcx +; FAST-X64-NEXT: orq %rsi, %rcx +; FAST-X64-NEXT: movabsq $-4294967296, %rdx # imm = 0xFFFFFFFF00000000 +; FAST-X64-NEXT: andq %rcx, %rdx +; FAST-X64-NEXT: je .LBB3_1 +; FAST-X64-NEXT: # %bb.2: +; FAST-X64-NEXT: cqto +; FAST-X64-NEXT: idivq %rsi +; FAST-X64-NEXT: movq %rdx, %rax +; FAST-X64-NEXT: retq +; FAST-X64-NEXT: .LBB3_1: +; FAST-X64-NEXT: # kill: def $eax killed $eax killed $rax +; FAST-X64-NEXT: xorl %edx, %edx +; FAST-X64-NEXT: divl %esi +; FAST-X64-NEXT: movl %edx, %eax +; FAST-X64-NEXT: retq +; +; GISEL-X64-LABEL: test_srem_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; GISEL-X64-NEXT: movq %rdi, %rdx +; GISEL-X64-NEXT: orq %rsi, %rdx +; GISEL-X64-NEXT: andq %rcx, %rdx +; GISEL-X64-NEXT: cmpq $0, %rdx +; GISEL-X64-NEXT: sete %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: jne .LBB3_1 +; GISEL-X64-NEXT: # %bb.2: +; GISEL-X64-NEXT: cqto +; GISEL-X64-NEXT: idivq %rsi +; GISEL-X64-NEXT: movq %rdx, %rax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: .LBB3_1: +; GISEL-X64-NEXT: # kill: def $eax killed $eax killed $rax +; GISEL-X64-NEXT: xorl %edx, %edx +; GISEL-X64-NEXT: divl %esi +; GISEL-X64-NEXT: movl %edx, %eax +; GISEL-X64-NEXT: retq ; ; DAG-X86-LABEL: test_srem_i64: ; DAG-X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll index b123b3c7780fa..c0655560effd6 100644 --- a/llvm/test/CodeGen/X86/isel-udiv.ll +++ b/llvm/test/CodeGen/X86/isel-udiv.ll @@ -77,13 +77,6 @@ define i32 @test_udiv_i32(i32 %arg1, i32 %arg2) nounwind { } define i64 @test_udiv_i64(i64 %arg1, i64 %arg2) nounwind { -; X64-LABEL: test_udiv_i64: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divq %rsi -; X64-NEXT: retq -; ; DAG-X86-LABEL: test_udiv_i64: ; DAG-X86: # %bb.0: ; DAG-X86-NEXT: subl $12, %esp diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll index 386f08151ad9c..ba865d4b6083d 100644 --- a/llvm/test/CodeGen/X86/isel-urem.ll +++ b/llvm/test/CodeGen/X86/isel-urem.ll @@ -110,13 +110,67 @@ define i32 @test_urem_i32(i32 %arg1, i32 %arg2) nounwind { } define i64 @test_urem_i64(i64 %arg1, i64 %arg2) nounwind { -; X64-LABEL: test_urem_i64: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divq %rsi -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: retq +; SDAG-X64-LABEL: test_urem_i64: +; SDAG-X64: # %bb.0: +; SDAG-X64-NEXT: movq %rdi, %rax +; SDAG-X64-NEXT: movq %rdi, %rcx +; SDAG-X64-NEXT: orq %rsi, %rcx +; SDAG-X64-NEXT: shrq $32, %rcx +; SDAG-X64-NEXT: je .LBB3_1 +; SDAG-X64-NEXT: # %bb.2: +; SDAG-X64-NEXT: xorl %edx, %edx +; SDAG-X64-NEXT: divq %rsi +; SDAG-X64-NEXT: movq %rdx, %rax +; SDAG-X64-NEXT: retq +; SDAG-X64-NEXT: .LBB3_1: +; SDAG-X64-NEXT: # kill: def $eax killed $eax killed $rax +; SDAG-X64-NEXT: xorl %edx, %edx +; SDAG-X64-NEXT: divl %esi +; SDAG-X64-NEXT: movl %edx, %eax +; SDAG-X64-NEXT: retq +; +; FAST-X64-LABEL: test_urem_i64: +; FAST-X64: # %bb.0: +; FAST-X64-NEXT: movq %rdi, %rax +; FAST-X64-NEXT: movq %rdi, %rcx +; FAST-X64-NEXT: orq %rsi, %rcx +; FAST-X64-NEXT: movabsq $-4294967296, %rdx # imm = 0xFFFFFFFF00000000 +; FAST-X64-NEXT: andq %rcx, %rdx +; FAST-X64-NEXT: je .LBB3_1 +; FAST-X64-NEXT: # %bb.2: +; FAST-X64-NEXT: xorl %edx, %edx +; FAST-X64-NEXT: divq %rsi +; FAST-X64-NEXT: movq %rdx, %rax +; FAST-X64-NEXT: retq +; FAST-X64-NEXT: .LBB3_1: +; 
FAST-X64-NEXT: # kill: def $eax killed $eax killed $rax +; FAST-X64-NEXT: xorl %edx, %edx +; FAST-X64-NEXT: divl %esi +; FAST-X64-NEXT: movl %edx, %eax +; FAST-X64-NEXT: retq +; +; GISEL-X64-LABEL: test_urem_i64: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: movq %rdi, %rax +; GISEL-X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 +; GISEL-X64-NEXT: movq %rdi, %rdx +; GISEL-X64-NEXT: orq %rsi, %rdx +; GISEL-X64-NEXT: andq %rcx, %rdx +; GISEL-X64-NEXT: cmpq $0, %rdx +; GISEL-X64-NEXT: sete %cl +; GISEL-X64-NEXT: testb $1, %cl +; GISEL-X64-NEXT: jne .LBB3_1 +; GISEL-X64-NEXT: # %bb.2: +; GISEL-X64-NEXT: xorl %edx, %edx +; GISEL-X64-NEXT: divq %rsi +; GISEL-X64-NEXT: movq %rdx, %rax +; GISEL-X64-NEXT: retq +; GISEL-X64-NEXT: .LBB3_1: +; GISEL-X64-NEXT: # kill: def $eax killed $eax killed $rax +; GISEL-X64-NEXT: xorl %edx, %edx +; GISEL-X64-NEXT: divl %esi +; GISEL-X64-NEXT: movl %edx, %eax +; GISEL-X64-NEXT: retq ; ; DAG-X86-LABEL: test_urem_i64: ; DAG-X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/isel-xor.ll b/llvm/test/CodeGen/X86/isel-xor.ll index a31ad78524ee1..7bf1d6d80146f 100644 --- a/llvm/test/CodeGen/X86/isel-xor.ll +++ b/llvm/test/CodeGen/X86/isel-xor.ll @@ -426,9 +426,9 @@ define i32 @xor_imm16_i32(i32 %a) { define i64 @xor_imm16_i64(i64 %a) { ; SDAG-X86-LABEL: xor_imm16_i64: ; SDAG-X86: # %bb.0: -; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: movl $-5022, %eax # imm = 0xEC62 ; SDAG-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: notl %edx ; SDAG-X86-NEXT: retl ; @@ -488,9 +488,9 @@ define i32 @xor_imm32_i32(i32 %a) { define i64 @xor_imm32_i64(i64 %a) { ; SDAG-X86-LABEL: xor_imm32_i64: ; SDAG-X86: # %bb.0: -; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: movl $-125778, %eax # imm = 0xFFFE14AE ; SDAG-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: notl %edx ; SDAG-X86-NEXT: retl ; @@ -522,9 +522,9 @@ define i64 @xor_imm32_i64(i64 %a) { define i64 @xor_imm64_i64(i64 %a) { ; SDAG-X86-LABEL: xor_imm64_i64: ; SDAG-X86: # %bb.0: -; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: movl $-1850691612, %eax # imm = 0x91B0AFE4 ; SDAG-X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; SDAG-X86-NEXT: xorl $-2, %edx ; SDAG-X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index badfd1af940ca..831a46001a8b7 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -197,9 +197,9 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { ; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; CHECK-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2 -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; CHECK-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm3 +; CHECK-AVX2-NEXT: vpxor %ymm2, %ymm3, %ymm2 ; CHECK-AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; CHECK-AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll index 261908fafc06e..34ecd31038c06 100644 --- 
a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -629,8 +629,8 @@ define <4 x float> @knownbits_lshr_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp ; X86-NEXT: vmovaps 8(%ebp), %xmm3 -; X86-NEXT: vpsrld $5, %xmm2, %xmm2 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 +; X86-NEXT: vpsrld $5, %xmm2, %xmm2 ; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] @@ -641,8 +641,8 @@ define <4 x float> @knownbits_lshr_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x ; ; X64-LABEL: knownbits_lshr_and_select_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpsrld $5, %xmm2, %xmm2 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-NEXT: vpsrld $5, %xmm2, %xmm2 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll index 58a0595e4322a..1d000385fc720 100644 --- a/llvm/test/CodeGen/X86/known-bits.ll +++ b/llvm/test/CodeGen/X86/known-bits.ll @@ -88,26 +88,24 @@ define i32 @knownbits_mask_add_lshr(i32 %a0, i32 %a1) nounwind { define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind { ; X86-LABEL: knownbits_mask_addc_shl: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl $-1024, %esi # imm = 0xFC00 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %esi, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shldl $22, %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shldl $22, %esi, %edx ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl $0, 4(%eax) ; X86-NEXT: movl $0, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; X64-LABEL: knownbits_mask_addc_shl: diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 63336ffa7c6c8..1149c17642bfb 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -309,14 +309,14 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) { define i32 @umin_known_nonzero(i32 %xx, i32 %yy) { ; X86-LABEL: umin_known_nonzero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $4, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: addl $4, %eax -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movl $4, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $4, %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: rep bsfl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: umin_known_nonzero: @@ -365,14 +365,14 @@ define i32 @umin_maybe_zero(i32 %x, i32 %y) { define i32 @smin_known_nonzero(i32 %xx, i32 %yy) { ; X86-LABEL: smin_known_nonzero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $4, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: addl $4, %eax -; 
X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movl $4, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $4, %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: rep bsfl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: smin_known_nonzero: @@ -477,14 +477,14 @@ define i32 @smin_maybe_zero(i32 %x, i32 %y) { define i32 @smax_known_nonzero(i32 %xx, i32 %yy) { ; X86-LABEL: smax_known_nonzero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $4, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: addl $4, %eax -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovgl %edx, %eax -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movl $4, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $4, %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: rep bsfl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: smax_known_nonzero: diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll index e183bbc15617d..d16c394a7095d 100644 --- a/llvm/test/CodeGen/X86/known-pow2.ll +++ b/llvm/test/CodeGen/X86/known-pow2.ll @@ -26,9 +26,9 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [954437177,1073741824,268435456,67108864] ; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; CHECK-NEXT: movdqa %xmm1, %xmm3 @@ -595,11 +595,6 @@ define i1 @pow2_select_fail2(i1 %c, i32 %x, i32 %y, i32 %z) { define <4 x i1> @pow2_vselect_eq(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: pow2_vselect_eq: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $31, %xmm0 -; CHECK-NEXT: psrad $31, %xmm0 -; CHECK-NEXT: pslld $23, %xmm2 -; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] ; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm5, %xmm6 @@ -607,14 +602,19 @@ define <4 x i1> @pow2_vselect_eq(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] ; CHECK-NEXT: movdqa %xmm5, %xmm7 ; CHECK-NEXT: psrld %xmm4, %xmm7 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: psrld %xmm4, %xmm6 +; CHECK-NEXT: movdqa %xmm5, %xmm8 +; CHECK-NEXT: psrld %xmm4, %xmm8 +; CHECK-NEXT: pslld $31, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: pslld $23, %xmm2 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; CHECK-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] ; CHECK-NEXT: psrld %xmm3, %xmm5 -; CHECK-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; CHECK-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm8[1] ; CHECK-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,3] ; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: pandn %xmm7, 
%xmm0 @@ -676,7 +676,6 @@ define <4 x i1> @pow2_vselect_fail0_ne(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, ; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pslld $23, %xmm2 ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] ; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [1073741824,1073741824,1073741824,1073741824] ; CHECK-NEXT: movdqa %xmm5, %xmm6 @@ -684,6 +683,7 @@ define <4 x i1> @pow2_vselect_fail0_ne(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] ; CHECK-NEXT: movdqa %xmm5, %xmm7 ; CHECK-NEXT: psrld %xmm4, %xmm7 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] @@ -712,36 +712,36 @@ define <4 x i1> @pow2_vselect_fail0_ne(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, define <4 x i1> @pow2_vselect_fail2_ne(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: pow2_vselect_fail2_ne: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $31, %xmm0 -; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pslld $23, %xmm2 ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 ; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4,4,4,4] ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: pmuludq %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: psrld %xmm4, %xmm6 -; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] -; CHECK-NEXT: movdqa %xmm5, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm6, %xmm7 ; CHECK-NEXT: psrld %xmm4, %xmm7 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,1,1,1,4,5,6,7] +; CHECK-NEXT: movdqa %xmm6, %xmm8 +; CHECK-NEXT: psrld %xmm4, %xmm8 +; CHECK-NEXT: pslld $31, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; CHECK-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: psrld %xmm4, %xmm6 +; CHECK-NEXT: movdqa %xmm6, %xmm5 +; CHECK-NEXT: psrld %xmm4, %xmm5 ; CHECK-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; CHECK-NEXT: psrld %xmm3, %xmm5 -; CHECK-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; CHECK-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,3] +; CHECK-NEXT: psrld %xmm3, %xmm6 +; CHECK-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; CHECK-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[0,3] ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pandn %xmm7, %xmm0 +; CHECK-NEXT: pandn %xmm8, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm1 @@ -804,13 +804,13 @@ define i1 @pow2_and_fail1(i32 %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: movl $1, 
%eax -; CHECK-NEXT: movl $1, %edx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shll %cl, %edx -; CHECK-NEXT: subl %edx, %eax -; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: andl %eax, %ecx ; CHECK-NEXT: notl %edi -; CHECK-NEXT: testl %edi, %eax +; CHECK-NEXT: testl %edi, %ecx ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %yy = shl i32 1, %y diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc307ed4..7258a406fad99 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -95,9 +95,9 @@ define void @computeNumSignBits_shl_zext_vec_2(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1024,4096,u,u,u,u,u,u] ; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpgtw %xmm0, %xmm1 ; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: psllw $2, %xmm2 @@ -132,9 +132,9 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,u,u,u,u,u,u] ; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpgtw %xmm0, %xmm1 ; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: paddw %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 45b61155fe626..0cae259c1cab9 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -62,13 +62,13 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X86-NEXT: vpsrlq $36, %xmm1, %xmm2 ; X86-NEXT: vpsrlq $35, %xmm1, %xmm1 ; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vpmovsxdq {{.*#+}} xmm2 = [268435456,134217728] +; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [268435456,0,134217728,0] ; X86-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpsrlq $34, %xmm0, %xmm2 ; X86-NEXT: vpsrlq $33, %xmm0, %xmm0 ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X86-NEXT: vpmovsxdq {{.*#+}} xmm2 = [1073741824,536870912] +; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [1073741824,0,536870912,0] ; X86-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -81,13 +81,13 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vpsrlq $36, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpsrlq $35, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X64-AVX1-NEXT: vpmovsxdq {{.*#+}} xmm2 = [268435456,134217728] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [268435456,134217728] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; 
X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpsrlq $34, %xmm0, %xmm2 ; X64-AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X64-AVX1-NEXT: vpmovsxdq {{.*#+}} xmm2 = [1073741824,536870912] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1073741824,536870912] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -97,7 +97,7 @@ define <4 x double> @signbits_ashr_sitofp_0(<4 x i64> %a0) nounwind { ; X64-AVX2-LABEL: signbits_ashr_sitofp_0: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [1073741824,536870912,268435456,134217728] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1073741824,536870912,268435456,134217728] ; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll index 6376b4d599de7..d6abbf7cce62c 100644 --- a/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll @@ -18,7 +18,7 @@ entry: define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: hadd_trunc_v8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0 @@ -181,8 +181,8 @@ define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x ; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll index 7bef94cca0d35..04c307ca3f21e 100644 --- a/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll +++ b/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll @@ -110,8 +110,8 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind { ; X86-NEXT: movswl %ax, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -135,8 +135,8 @@ define i1 @shifts_necmp_i64_i8(i64 %x) nounwind { ; X86-NEXT: movsbl %al, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/lea-16bit.ll b/llvm/test/CodeGen/X86/lea-16bit.ll index cec29ab1da6ab..043c71559f477 100644 --- a/llvm/test/CodeGen/X86/lea-16bit.ll +++ b/llvm/test/CodeGen/X86/lea-16bit.ll @@ -6,7 +6,7 @@ define i16 @lea16bit(i16 %in) { ; NO-NDD-LABEL: lea16bit: ; NO-NDD: # %bb.0: ; NO-NDD-NEXT: # kill: def $edi killed $edi def $rdi -; NO-NDD-NEXT: leal 
1(%rdi,%rdi), %eax +; NO-NDD-NEXT: leal 1(,%rdi,2), %eax ; NO-NDD-NEXT: # kill: def $ax killed $ax killed $eax ; NO-NDD-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/lea-2.ll b/llvm/test/CodeGen/X86/lea-2.ll index a48c02ff3e0b7..c599c3f73c26e 100644 --- a/llvm/test/CodeGen/X86/lea-2.ll +++ b/llvm/test/CodeGen/X86/lea-2.ll @@ -11,14 +11,16 @@ define i32 @test1(i32 %A, i32 %B) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal -5(%ecx,%eax,4), %eax +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl $-5, %eax ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal -5(%rsi,%rdi,4), %eax +; X64-NEXT: leal (%rsi,%rdi,4), %eax +; X64-NEXT: addl $-5, %eax ; X64-NEXT: retq %t1 = shl i32 %A, 2 %t3 = add i32 %B, -5 @@ -35,7 +37,7 @@ define i64 @test2(i32 %a0, i64 %a1) { ; X86-NEXT: movl %edx, %eax ; X86-NEXT: andl $2147483640, %eax # imm = 0x7FFFFFF8 ; X86-NEXT: shrl $31, %edx -; X86-NEXT: leal 4(%eax,%eax), %eax +; X86-NEXT: leal 4(,%eax,2), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl @@ -44,7 +46,8 @@ define i64 @test2(i32 %a0, i64 %a1) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: andl $-8, %edi -; X64-NEXT: leaq 4(%rsi,%rdi,2), %rax +; X64-NEXT: leaq (%rsi,%rdi,2), %rax +; X64-NEXT: addq $4, %rax ; X64-NEXT: retq %x1 = and i32 %a0, -8 %x2 = or i32 %x1, 2 diff --git a/llvm/test/CodeGen/X86/lea-4.ll b/llvm/test/CodeGen/X86/lea-4.ll index e1f0b73c33ffb..cf6c1e6073b22 100644 --- a/llvm/test/CodeGen/X86/lea-4.ll +++ b/llvm/test/CodeGen/X86/lea-4.ll @@ -7,7 +7,7 @@ define zeroext i16 @t1(i32 %on_off) nounwind { ; CHECK-LABEL: t1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal -2(%rdi,%rdi), %eax +; CHECK-NEXT: leal -2(,%rdi,2), %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %t0 = sub i32 %on_off, 1 @@ -22,7 +22,7 @@ define i32 @t2(i32 %on_off) nounwind { ; CHECK-LABEL: t2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal -2(%rdi,%rdi), %eax +; CHECK-NEXT: leal -2(,%rdi,2), %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: retq %t0 = sub i32 %on_off, 1 diff --git a/llvm/test/CodeGen/X86/lea-5.ll b/llvm/test/CodeGen/X86/lea-5.ll index 908ec3eae7f65..8128625075c39 100644 --- a/llvm/test/CodeGen/X86/lea-5.ll +++ b/llvm/test/CodeGen/X86/lea-5.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; test for more complicated forms of lea operands which can be generated ; in loop optimized cases. 
; See also http://llvm.org/bugs/show_bug.cgi?id=20016 @@ -8,6 +9,34 @@ ; Function Attrs: nounwind readnone uwtable define void @foo(i32 %x, i32 %d) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: leaq (%rsp,%rax,4), %rax +; CHECK-NEXT: addq $-40, %rax +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %while.cond +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cmpl $0, (%rax) +; CHECK-NEXT: leaq 4(%rax), %rax +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # %bb.2: # %while.end +; CHECK-NEXT: retq +; +; X32-LABEL: foo: +; X32: # %bb.0: # %entry +; X32-NEXT: # kill: def $esi killed $esi def $rsi +; X32-NEXT: leal (%rsp,%rsi,4), %eax +; X32-NEXT: addl $-40, %eax +; X32-NEXT: .p2align 4 +; X32-NEXT: .LBB0_1: # %while.cond +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: cmpl $0, (%eax) +; X32-NEXT: leal 4(%rax), %eax +; X32-NEXT: # kill: def $eax killed $eax def $rax +; X32-NEXT: jne .LBB0_1 +; X32-NEXT: # %bb.2: # %while.end +; X32-NEXT: retq entry: %a = alloca [8 x i32], align 16 br label %while.cond @@ -16,14 +45,10 @@ while.cond: ; preds = %while.cond, %entry %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] %arrayidx = getelementptr inbounds [8 x i32], ptr %a, i32 0, i32 %d.addr.0 -; CHECK: leaq -40(%rsp,%r{{[^,]*}},4), %rax -; X32: leal -40(%rsp,%r{{[^,]*}},4), %eax %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp eq i32 %0, 0 %inc = add nsw i32 %d.addr.0, 1 -; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} -; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} br i1 %cmp1, label %while.end, label %while.cond while.end: ; preds = %while.cond @@ -35,6 +60,28 @@ while.end: ; preds = %while.cond ; Function Attrs: nounwind readnone uwtable define void @bar(i32 %x, i32 %d) #0 { +; CHECK-LABEL: bar: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: leaq (%rsp,%rax,4), %rax +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_1: # %while.cond +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cmpl $0, (%rax) +; CHECK-NEXT: leaq 4(%rax), %rax +; CHECK-NEXT: jne .LBB1_1 +; CHECK-NEXT: # %bb.2: # %while.end +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq entry: %a = alloca [8 x i32], align 64 br label %while.cond @@ -43,14 +90,10 @@ while.cond: ; preds = %while.cond, %entry %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] %arrayidx = getelementptr inbounds [8 x i32], ptr %a, i32 0, i32 %d.addr.0 -; CHECK: leaq (%rsp,%r{{[^,]*}},4), %rax -; X32: leal (%rsp,%r{{[^,]*}},4), %eax %0 = load i32, ptr %arrayidx, align 4 %cmp1 = icmp eq i32 %0, 0 %inc = add nsw i32 %d.addr.0, 1 -; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} -; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} br i1 %cmp1, label %while.end, label %while.cond while.end: ; preds = %while.cond diff --git a/llvm/test/CodeGen/X86/lea-opt-cse1.ll b/llvm/test/CodeGen/X86/lea-opt-cse1.ll index 5ceca9fbd9b5f..72d8e4b56bed5 100644 --- a/llvm/test/CodeGen/X86/lea-opt-cse1.ll +++ b/llvm/test/CodeGen/X86/lea-opt-cse1.ll @@ -10,9 +10,11 @@ define void @test_func(ptr nocapture %ctx, i32 %n) local_unnamed_addr { ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: movl 16(%rdi), %ecx ; X64-NEXT: leal (%rax,%rcx), %edx -; X64-NEXT: leal 1(%rax,%rcx), %eax +; X64-NEXT: addl 
%ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: movl %eax, 12(%rdi) -; X64-NEXT: leal 1(%rcx,%rdx), %eax +; X64-NEXT: leal (%rcx,%rdx), %eax +; X64-NEXT: incl %eax ; X64-NEXT: movl %eax, 16(%rdi) ; X64-NEXT: retq ; @@ -24,10 +26,12 @@ define void @test_func(ptr nocapture %ctx, i32 %n) local_unnamed_addr { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx ; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: leal 1(%ecx,%edx), %esi +; X86-NEXT: leal (%ecx,%edx), %esi +; X86-NEXT: incl %esi ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: leal 1(%edx,%ecx), %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/lea-opt-cse2.ll b/llvm/test/CodeGen/X86/lea-opt-cse2.ll index e39d01f1447f8..7e3801b3253e5 100644 --- a/llvm/test/CodeGen/X86/lea-opt-cse2.ll +++ b/llvm/test/CodeGen/X86/lea-opt-cse2.ll @@ -12,13 +12,15 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: movl 16(%rdi), %ecx -; X64-NEXT: leal 1(%rax,%rcx), %edx +; X64-NEXT: leal (%rax,%rcx), %edx +; X64-NEXT: incl %edx ; X64-NEXT: movl %edx, 12(%rdi) ; X64-NEXT: decl %esi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: addl %ecx, %eax -; X64-NEXT: leal 1(%rcx,%rax), %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: movl %eax, 16(%rdi) ; X64-NEXT: retq ; @@ -37,13 +39,15 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl (%eax), %edx ; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: leal 1(%edx,%esi), %edi +; X86-NEXT: leal (%edx,%esi), %edi +; X86-NEXT: incl %edi ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: decl %ecx ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit ; X86-NEXT: addl %esi, %edx -; X86-NEXT: leal 1(%esi,%edx), %ecx +; X86-NEXT: leal (%esi,%edx), %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/lea-opt-cse3.ll b/llvm/test/CodeGen/X86/lea-opt-cse3.ll index 93e4fa77b5629..67acf8808f85c 100644 --- a/llvm/test/CodeGen/X86/lea-opt-cse3.ll +++ b/llvm/test/CodeGen/X86/lea-opt-cse3.ll @@ -7,8 +7,10 @@ define i32 @foo(i32 %a, i32 %b) local_unnamed_addr #0 { ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rsi,2), %ecx -; X64-NEXT: leal 4(%rdi,%rsi,4), %eax +; X64-NEXT: leal (%rdi,%rsi,2), %ecx +; X64-NEXT: addl $4, %ecx +; X64-NEXT: leal (%rdi,%rsi,4), %eax +; X64-NEXT: addl $4, %eax ; X64-NEXT: imull %ecx, %eax ; X64-NEXT: retq ; @@ -16,8 +18,10 @@ define i32 @foo(i32 %a, i32 %b) local_unnamed_addr #0 { ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 4(%ecx,%eax,2), %edx -; X86-NEXT: leal 4(%ecx,%eax,4), %eax +; X86-NEXT: leal (%ecx,%eax,2), %edx +; X86-NEXT: addl $4, %edx +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl $4, %eax ; X86-NEXT: imull %edx, %eax ; X86-NEXT: retl entry: @@ -35,8 +39,10 @@ define i32 @foo1(i32 %a, i32 %b) local_unnamed_addr #0 { ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rsi,4), %ecx -; X64-NEXT: leal 4(%rdi,%rsi,8), %eax +; X64-NEXT: 
leal (%rdi,%rsi,4), %ecx +; X64-NEXT: addl $4, %ecx +; X64-NEXT: leal (%rdi,%rsi,8), %eax +; X64-NEXT: addl $4, %eax ; X64-NEXT: imull %ecx, %eax ; X64-NEXT: retq ; @@ -44,8 +50,10 @@ define i32 @foo1(i32 %a, i32 %b) local_unnamed_addr #0 { ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 4(%ecx,%eax,4), %edx -; X86-NEXT: leal 4(%ecx,%eax,8), %eax +; X86-NEXT: leal (%ecx,%eax,4), %edx +; X86-NEXT: addl $4, %edx +; X86-NEXT: leal (%ecx,%eax,8), %eax +; X86-NEXT: addl $4, %eax ; X86-NEXT: imull %edx, %eax ; X86-NEXT: retl entry: @@ -63,12 +71,14 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 { ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rsi,4), %ecx +; X64-NEXT: leal (%rdi,%rsi,4), %ecx +; X64-NEXT: addl $4, %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $10, %ecx ; X64-NEXT: je .LBB2_2 ; X64-NEXT: # %bb.1: # %mid -; X64-NEXT: leal 4(%rdi,%rsi,8), %eax +; X64-NEXT: leal (%rdi,%rsi,8), %eax +; X64-NEXT: addl $4, %eax ; X64-NEXT: imull %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: .LBB2_2: # %exit @@ -81,12 +91,14 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 { ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: leal 4(%esi,%edx,4), %ecx +; X86-NEXT: leal (%esi,%edx,4), %ecx +; X86-NEXT: addl $4, %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $10, %ecx ; X86-NEXT: je .LBB2_2 ; X86-NEXT: # %bb.1: # %mid -; X86-NEXT: leal 4(%esi,%edx,8), %eax +; X86-NEXT: leal (%esi,%edx,8), %eax +; X86-NEXT: addl $4, %eax ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB2_2: # %exit @@ -116,12 +128,14 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rsi,2), %ecx +; X64-NEXT: leal (%rdi,%rsi,2), %ecx +; X64-NEXT: addl $4, %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $10, %ecx ; X64-NEXT: je .LBB3_2 ; X64-NEXT: # %bb.1: # %mid -; X64-NEXT: leal 4(%rdi,%rsi,8), %eax +; X64-NEXT: leal (%rdi,%rsi,8), %eax +; X64-NEXT: addl $4, %eax ; X64-NEXT: imull %eax, %ecx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: .LBB3_2: # %exit @@ -134,12 +148,14 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: leal 4(%esi,%edx,2), %ecx +; X86-NEXT: leal (%esi,%edx,2), %ecx +; X86-NEXT: addl $4, %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $10, %ecx ; X86-NEXT: je .LBB3_2 ; X86-NEXT: # %bb.1: # %mid -; X86-NEXT: leal 4(%esi,%edx,8), %eax +; X86-NEXT: leal (%esi,%edx,8), %eax +; X86-NEXT: addl $4, %eax ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: # %exit diff --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll index 4fa9acd99bb2f..570df5b522399 100644 --- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll +++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll @@ -13,9 +13,11 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X64-NEXT: addl %eax, %ecx ; X64-NEXT: addl %eax, %ecx ; X64-NEXT: leal (%rcx,%rax), %edx -; X64-NEXT: leal 1(%rax,%rcx), %ecx +; X64-NEXT: addl %eax, %ecx +; 
X64-NEXT: incl %ecx ; X64-NEXT: movl %ecx, 12(%rdi) -; X64-NEXT: leal 1(%rax,%rdx), %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: incl %eax ; X64-NEXT: movl %eax, 16(%rdi) ; X64-NEXT: retq ; @@ -30,10 +32,12 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: leal 1(%ecx,%edx), %esi +; X86-NEXT: leal (%ecx,%edx), %esi +; X86-NEXT: incl %esi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: leal 1(%ecx,%edx), %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 @@ -64,13 +68,15 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: movl 16(%rdi), %eax -; X64-NEXT: leal 1(%rcx,%rax), %edx +; X64-NEXT: leal (%rcx,%rax), %edx +; X64-NEXT: incl %edx ; X64-NEXT: movl %edx, 12(%rdi) ; X64-NEXT: decl %esi ; X64-NEXT: jne .LBB1_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: addl %eax, %ecx -; X64-NEXT: leal 1(%rax,%rcx), %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: incl %ecx ; X64-NEXT: leal (%rax,%rax), %edx ; X64-NEXT: addl %eax, %edx ; X64-NEXT: addl %edx, %ecx @@ -93,13 +99,15 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl (%eax), %esi ; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: leal 1(%esi,%ecx), %edi +; X86-NEXT: leal (%esi,%ecx), %edi +; X86-NEXT: incl %edi ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: decl %edx ; X86-NEXT: jne .LBB1_1 ; X86-NEXT: # %bb.2: # %exit ; X86-NEXT: addl %ecx, %esi -; X86-NEXT: leal 1(%ecx,%esi), %edx +; X86-NEXT: leal (%ecx,%esi), %edx +; X86-NEXT: incl %edx ; X86-NEXT: leal (%ecx,%ecx), %esi ; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl %esi, %edx diff --git a/llvm/test/CodeGen/X86/lea-opt.ll b/llvm/test/CodeGen/X86/lea-opt.ll index 88712328e54a7..39426bccb648c 100644 --- a/llvm/test/CodeGen/X86/lea-opt.ll +++ b/llvm/test/CodeGen/X86/lea-opt.ll @@ -13,8 +13,9 @@ define void @test1(i64 %x) nounwind { ; ENABLED: # %bb.0: # %entry ; ENABLED-NEXT: shlq $2, %rdi ; ENABLED-NEXT: movl arr1(%rdi,%rdi,2), %ecx -; ENABLED-NEXT: leaq arr1+4(%rdi,%rdi,2), %rax ; ENABLED-NEXT: subl arr1+4(%rdi,%rdi,2), %ecx +; ENABLED-NEXT: leaq (%rdi,%rdi,2), %rax +; ENABLED-NEXT: addq $arr1+4, %rax ; ENABLED-NEXT: addl arr1+8(%rdi,%rdi,2), %ecx ; ENABLED-NEXT: cmpl $2, %ecx ; ENABLED-NEXT: je .LBB0_3 @@ -35,9 +36,11 @@ define void @test1(i64 %x) nounwind { ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: shlq $2, %rdi ; DISABLED-NEXT: movl arr1(%rdi,%rdi,2), %edx -; DISABLED-NEXT: leaq arr1+4(%rdi,%rdi,2), %rax +; DISABLED-NEXT: leaq (%rdi,%rdi,2), %rax +; DISABLED-NEXT: addq $arr1+4, %rax ; DISABLED-NEXT: subl arr1+4(%rdi,%rdi,2), %edx -; DISABLED-NEXT: leaq arr1+8(%rdi,%rdi,2), %rcx +; DISABLED-NEXT: leaq (%rdi,%rdi,2), %rcx +; DISABLED-NEXT: addq $arr1+8, %rcx ; DISABLED-NEXT: addl arr1+8(%rdi,%rdi,2), %edx ; DISABLED-NEXT: cmpl $2, %edx ; DISABLED-NEXT: je .LBB0_3 @@ -85,7 +88,8 @@ define void @test2(i64 %x) nounwind optsize { ; ENABLED-LABEL: test2: ; ENABLED: # %bb.0: # %entry ; ENABLED-NEXT: shlq $2, %rdi -; ENABLED-NEXT: leaq arr1+4(%rdi,%rdi,2), %rax +; ENABLED-NEXT: leaq (%rdi,%rdi,2), %rax +; ENABLED-NEXT: addq $arr1+4, %rax ; ENABLED-NEXT: movl -4(%rax), %ecx ; ENABLED-NEXT: subl (%rax), %ecx ; ENABLED-NEXT: addl 4(%rax), %ecx @@ -108,9 +112,11 @@ 
define void @test2(i64 %x) nounwind optsize { ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: shlq $2, %rdi ; DISABLED-NEXT: movl arr1(%rdi,%rdi,2), %edx -; DISABLED-NEXT: leaq arr1+4(%rdi,%rdi,2), %rax +; DISABLED-NEXT: leaq (%rdi,%rdi,2), %rax +; DISABLED-NEXT: addq $arr1+4, %rax ; DISABLED-NEXT: subl arr1+4(%rdi,%rdi,2), %edx -; DISABLED-NEXT: leaq arr1+8(%rdi,%rdi,2), %rcx +; DISABLED-NEXT: leaq (%rdi,%rdi,2), %rcx +; DISABLED-NEXT: addq $arr1+8, %rcx ; DISABLED-NEXT: addl arr1+8(%rdi,%rdi,2), %edx ; DISABLED-NEXT: cmpl $2, %edx ; DISABLED-NEXT: je .LBB1_3 @@ -163,8 +169,10 @@ define void @test3(i64 %x) nounwind optsize { ; ENABLED: # %bb.0: # %entry ; ENABLED-NEXT: movq %rdi, %rax ; ENABLED-NEXT: shlq $7, %rax -; ENABLED-NEXT: leaq arr2+132(%rax,%rdi,8), %rcx -; ENABLED-NEXT: leaq arr2(%rax,%rdi,8), %rax +; ENABLED-NEXT: leaq (%rax,%rdi,8), %rcx +; ENABLED-NEXT: addq $arr2+132, %rcx +; ENABLED-NEXT: leaq (%rax,%rdi,8), %rax +; ENABLED-NEXT: addq $arr2, %rax ; ENABLED-NEXT: movl (%rcx), %edx ; ENABLED-NEXT: addl (%rax), %edx ; ENABLED-NEXT: cmpl $2, %edx @@ -186,8 +194,10 @@ define void @test3(i64 %x) nounwind optsize { ; DISABLED: # %bb.0: # %entry ; DISABLED-NEXT: movq %rdi, %rsi ; DISABLED-NEXT: shlq $7, %rsi -; DISABLED-NEXT: leaq arr2+132(%rsi,%rdi,8), %rcx -; DISABLED-NEXT: leaq arr2(%rsi,%rdi,8), %rax +; DISABLED-NEXT: leaq (%rsi,%rdi,8), %rcx +; DISABLED-NEXT: addq $arr2+132, %rcx +; DISABLED-NEXT: leaq (%rsi,%rdi,8), %rax +; DISABLED-NEXT: addq $arr2, %rax ; DISABLED-NEXT: movl arr2+132(%rsi,%rdi,8), %edx ; DISABLED-NEXT: addl arr2(%rsi,%rdi,8), %edx ; DISABLED-NEXT: cmpl $2, %edx diff --git a/llvm/test/CodeGen/X86/lea-recursion.ll b/llvm/test/CodeGen/X86/lea-recursion.ll index 07a550fa394d6..fb77adc8b7e4d 100644 --- a/llvm/test/CodeGen/X86/lea-recursion.ll +++ b/llvm/test/CodeGen/X86/lea-recursion.ll @@ -18,7 +18,8 @@ define dso_local void @foo() { ; CHECK-NEXT: movl g0(%rip), %eax ; CHECK-NEXT: movl g1(%rip), %ecx ; CHECK-NEXT: leal (%rax,%rcx), %edx -; CHECK-NEXT: leal 1(%rax,%rcx), %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: incl %eax ; CHECK-NEXT: movl %eax, g0+4(%rip) ; CHECK-NEXT: movl g1+4(%rip), %eax ; CHECK-NEXT: leal 1(%rax,%rdx), %ecx @@ -41,7 +42,8 @@ define dso_local void @foo() { ; CHECK-NEXT: leal 2(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+24(%rip) ; CHECK-NEXT: movl g1+24(%rip), %eax -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl $2, %eax ; CHECK-NEXT: movl %eax, g0+28(%rip) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/lea.ll b/llvm/test/CodeGen/X86/lea.ll index 33d121f6849ba..e6f22c358486c 100644 --- a/llvm/test/CodeGen/X86/lea.ll +++ b/llvm/test/CodeGen/X86/lea.ll @@ -34,7 +34,8 @@ define i32 @test2(i32 %x_offs) nounwind readnone { ; LINUX-NEXT: leal -5(%rdi), %eax ; LINUX-NEXT: andl $-4, %eax ; LINUX-NEXT: negl %eax -; LINUX-NEXT: leal -4(%rdi,%rax), %eax +; LINUX-NEXT: addl %edi, %eax +; LINUX-NEXT: addl $-4, %eax ; LINUX-NEXT: retq ; LINUX-NEXT: .LBB1_2: # %bb2 ; LINUX-NEXT: movl %edi, %eax @@ -49,7 +50,8 @@ define i32 @test2(i32 %x_offs) nounwind readnone { ; WIN-NEXT: leal -5(%rcx), %eax ; WIN-NEXT: andl $-4, %eax ; WIN-NEXT: negl %eax -; WIN-NEXT: leal -4(%rcx,%rax), %eax +; WIN-NEXT: addl %ecx, %eax +; WIN-NEXT: addl $-4, %eax ; WIN-NEXT: retq ; WIN-NEXT: .LBB1_2: # %bb2 ; WIN-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll index 53208de7ea27e..d5a2568e993e1 100644 --- 
a/llvm/test/CodeGen/X86/legalize-shift-64.ll +++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll @@ -88,33 +88,33 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) { ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: .cfi_offset %ebp, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %ch -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movl %ebx, %edi +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: shll %cl, %edi ; CHECK-NEXT: shldl %cl, %ebx, %esi -; CHECK-NEXT: testb $32, %cl +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: testb $32, %dl ; CHECK-NEXT: je .LBB4_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: movl %edi, %esi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: movl %edx, %ebx -; CHECK-NEXT: movb %ch, %cl -; CHECK-NEXT: shll %cl, %ebx -; CHECK-NEXT: shldl %cl, %edx, %ebp -; CHECK-NEXT: testb $32, %ch +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: shll %cl, %edx +; CHECK-NEXT: shldl %cl, %ebp, %ebx +; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: je .LBB4_4 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: movl %ebx, %ebp -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: movl %ebp, 12(%eax) -; CHECK-NEXT: movl %ebx, 8(%eax) +; CHECK-NEXT: movl %ebx, 12(%eax) +; CHECK-NEXT: movl %edx, 8(%eax) ; CHECK-NEXT: movl %esi, 4(%eax) ; CHECK-NEXT: movl %edi, (%eax) ; CHECK-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll index 5e168a82e03e7..81af698cc44dd 100644 --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -6,8 +6,8 @@ define <2 x i256> @test_shl(<2 x i256> %In) nounwind { ; X86-LABEL: test_shl: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shldl $2, %ecx, %edx ; X86-NEXT: movl %edx, 60(%eax) ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -44,23 +44,23 @@ define <2 x i256> @test_shl(<2 x i256> %In) nounwind { ; ; X64-LABEL: test_shl: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: shldq $2, %rdx, %rcx -; X64-NEXT: shldq $2, %rdi, %rdx -; X64-NEXT: shldq $2, %r9, %rdi +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: shldq $2, %rcx, %rdx +; X64-NEXT: shldq $2, %r8, %rcx +; X64-NEXT: shldq $2, %r9, %r8 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: shlq $63, %rsi ; X64-NEXT: shlq $2, %r9 -; X64-NEXT: movq %rcx, 56(%rax) -; X64-NEXT: movq %rdx, 48(%rax) -; X64-NEXT: movq %rdi, 40(%rax) -; X64-NEXT: movq %r9, 32(%rax) -; X64-NEXT: movq %rsi, 24(%rax) +; X64-NEXT: movq %rdx, 56(%rdi) +; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: movq %r8, 40(%rdi) +; X64-NEXT: movq %r9, 32(%rdi) +; X64-NEXT: movq %rsi, 24(%rdi) ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movaps %xmm0, (%rax) -; X64-NEXT: movq $0, 16(%rax) +; X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: movq $0, 16(%rdi) ; X64-NEXT: retq %Amt = insertelement <2 x i256> , i256 255, i32 0 %Out = shl <2 x i256> %In, %Amt @@ -75,37 +75,35 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind { 
; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shldl $28, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $28, %ebx, %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: shldl $28, %ecx, %ebx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shldl $28, %edi, %esi -; X86-NEXT: shldl $28, %eax, %edi -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shldl $28, %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $28, %esi, %edi +; X86-NEXT: shldl $28, %edx, %esi ; X86-NEXT: shldl $28, %eax, %edx +; X86-NEXT: shldl $28, %ebp, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl $28, %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrdl $4, %eax, %ecx -; X86-NEXT: shrl $4, %ebp +; X86-NEXT: shrl $4, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, 56(%eax) -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, 52(%eax) -; X86-NEXT: movl %ebx, 48(%eax) -; X86-NEXT: movl %esi, 44(%eax) -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl %edx, 36(%eax) +; X86-NEXT: movl %ebx, 60(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 56(%eax) +; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl %esi, 48(%eax) +; X86-NEXT: movl %edx, 44(%eax) +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 40(%eax) +; X86-NEXT: movl %ebp, 36(%eax) ; X86-NEXT: movl %ecx, 32(%eax) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $31, %ecx @@ -129,20 +127,20 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: shrdq $4, %rdx, %r9 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: shrdq $4, %rsi, %r9 -; X64-NEXT: shrdq $4, %rdx, %rsi ; X64-NEXT: shrdq $4, %rcx, %rdx ; X64-NEXT: shrq $63, %r8 -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: movq %rcx, 56(%rdi) -; X64-NEXT: movq %rdx, 48(%rdi) -; X64-NEXT: movq %rsi, 40(%rdi) +; X64-NEXT: shrdq $4, %rsi, %rcx +; X64-NEXT: shrq $4, %rsi +; X64-NEXT: movq %rsi, 56(%rdi) +; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: movq %rdx, 40(%rdi) ; X64-NEXT: movq %r9, 32(%rdi) -; X64-NEXT: movq %r8, (%rdi) ; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movaps %xmm0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) +; X64-NEXT: movups %xmm0, 8(%rdi) +; X64-NEXT: movq %r8, (%rdi) +; X64-NEXT: movq $0, 24(%rdi) ; X64-NEXT: retq %Amt = insertelement <2 x i256> , i256 255, i32 0 %Out = lshr <2 x i256> %In, %Amt @@ -157,37 +155,35 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shldl $26, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $26, %ebx, %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: shldl $26, %ecx, %ebx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shldl $26, %edi, %esi -; X86-NEXT: shldl $26, %eax, %edi -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shldl $26, %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $26, %esi, %edi +; X86-NEXT: shldl $26, %edx, %esi ; X86-NEXT: shldl $26, %eax, %edx +; X86-NEXT: shldl $26, %ebp, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl $26, %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrdl $6, %eax, %ecx -; X86-NEXT: sarl $6, %ebp +; X86-NEXT: sarl $6, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, 56(%eax) -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, 52(%eax) -; X86-NEXT: movl %ebx, 48(%eax) -; X86-NEXT: movl %esi, 44(%eax) -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl %edx, 36(%eax) +; X86-NEXT: movl %ebx, 60(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 56(%eax) +; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl %esi, 48(%eax) +; X86-NEXT: movl %edx, 44(%eax) +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 40(%eax) +; X86-NEXT: movl %ebp, 36(%eax) ; X86-NEXT: movl %ecx, 32(%eax) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $31, %ecx @@ -211,15 +207,15 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: shrdq $6, %rdx, %r9 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: shrdq $6, %rsi, %r9 -; X64-NEXT: shrdq $6, %rdx, %rsi ; X64-NEXT: shrdq $6, %rcx, %rdx ; X64-NEXT: sarq $63, %r8 -; X64-NEXT: sarq $6, %rcx -; X64-NEXT: movq %rcx, 56(%rdi) -; X64-NEXT: movq %rdx, 48(%rdi) -; X64-NEXT: movq %rsi, 40(%rdi) +; X64-NEXT: shrdq $6, %rsi, %rcx +; X64-NEXT: sarq $6, %rsi +; X64-NEXT: movq %rsi, 56(%rdi) +; X64-NEXT: movq %rcx, 48(%rdi) +; X64-NEXT: movq %rdx, 40(%rdi) ; X64-NEXT: movq %r9, 32(%rdi) ; X64-NEXT: movq %r8, 24(%rdi) ; X64-NEXT: movq %r8, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll index 8436c1052552e..3db349730e9ed 100644 --- a/llvm/test/CodeGen/X86/llvm.frexp.ll +++ b/llvm/test/CodeGen/X86/llvm.frexp.ll @@ -13,9 +13,9 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; X64-NEXT: callq __truncsfhf2@PLT ; X64-NEXT: pextrw $0, %xmm0, %ecx ; X64-NEXT: movl %ecx, %eax -; X64-NEXT: andl $31744, %eax # imm = 0x7C00 ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; X64-NEXT: pextrw $0, %xmm0, %edx +; X64-NEXT: andl $31744, %eax # imm = 0x7C00 ; X64-NEXT: movl %edx, %esi ; X64-NEXT: andl $32767, %esi # imm = 0x7FFF ; X64-NEXT: cmpl $1024, %esi # imm = 0x400 diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll index 52e0eb826d143..d434d865a861e 100644 --- 
a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -22,8 +22,8 @@ define <3 x i32> @masked_load_v3(ptr addrspace(1), <3 x i1>) { ; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: # implicit-def: $xmm0 +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %else ; CHECK-NEXT: testb $2, %cl diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll index d2359ced3e19d..f7856f737c1cd 100644 --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -494,18 +494,32 @@ define <2 x i64> @udiv_op1_constant(ptr %p) nounwind { define <2 x i64> @urem_op0_constant(ptr %p) nounwind { ; SSE-LABEL: urem_op0_constant: ; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movl $42, %edx +; SSE-NEXT: cmpq $42, %rcx +; SSE-NEXT: ja .LBB22_2 +; SSE-NEXT: # %bb.1: ; SSE-NEXT: movl $42, %eax ; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: divq (%rdi) -; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: divl %ecx +; SSE-NEXT: # kill: def $edx killed $edx def $rdx +; SSE-NEXT: .LBB22_2: +; SSE-NEXT: movd %edx, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: urem_op0_constant: ; AVX: # %bb.0: +; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: movl $42, %edx +; AVX-NEXT: cmpq $42, %rcx +; AVX-NEXT: ja .LBB22_2 +; AVX-NEXT: # %bb.1: ; AVX-NEXT: movl $42, %eax ; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divq (%rdi) -; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: divl %ecx +; AVX-NEXT: # kill: def $edx killed $edx def $rdx +; AVX-NEXT: .LBB22_2: +; AVX-NEXT: vmovd %edx, %xmm0 ; AVX-NEXT: retq %x = load i64, ptr %p %b = urem i64 42, %x diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll index d7390299c7b10..e55567d0a04de 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i686-- -relocation-model=pic | FileCheck %s -check-prefix=PIC ; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s -check-prefix=STATIC ; @@ -5,23 +6,56 @@ ; since too many registers are needed to subsume it into the addressing modes. ; It's safe to sink A in when it's not pic. 
-; PIC: align -; PIC: movl $4, -4([[REG:%e[a-z]+]]) -; PIC: movl $5, ([[REG]]) -; PIC: addl $4, [[REG]] -; PIC: decl {{%e[[a-z]+}} -; PIC: jne - -; STATIC: align -; STATIC: movl $4, -4(%ecx) -; STATIC: movl $5, (%ecx) -; STATIC: addl $4, %ecx -; STATIC: decl %eax -; STATIC: jne - @A = global [16 x [16 x i32]] zeroinitializer, align 32 ; [#uses=2] define void @test(i32 %row, i32 %N.in) nounwind { +; PIC-LABEL: test: +; PIC: # %bb.0: # %entry +; PIC-NEXT: calll .L0$pb +; PIC-NEXT: .L0$pb: +; PIC-NEXT: popl %ecx +; PIC-NEXT: .Ltmp0: +; PIC-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx +; PIC-NEXT: movl {{[0-9]+}}(%esp), %eax +; PIC-NEXT: testl %eax, %eax +; PIC-NEXT: jle .LBB0_3 +; PIC-NEXT: # %bb.1: # %cond_true.preheader +; PIC-NEXT: movl {{[0-9]+}}(%esp), %edx +; PIC-NEXT: shll $6, %edx +; PIC-NEXT: movl A@GOT(%ecx), %ecx +; PIC-NEXT: addl %edx, %ecx +; PIC-NEXT: addl $8, %ecx +; PIC-NEXT: xorl %edx, %edx +; PIC-NEXT: .p2align 4 +; PIC-NEXT: .LBB0_2: # %cond_true +; PIC-NEXT: # =>This Inner Loop Header: Depth=1 +; PIC-NEXT: movl $4, -4(%ecx,%edx,4) +; PIC-NEXT: movl $5, (%ecx,%edx,4) +; PIC-NEXT: incl %edx +; PIC-NEXT: cmpl %edx, %eax +; PIC-NEXT: jne .LBB0_2 +; PIC-NEXT: .LBB0_3: # %return +; PIC-NEXT: retl +; +; STATIC-LABEL: test: +; STATIC: # %bb.0: # %entry +; STATIC-NEXT: movl {{[0-9]+}}(%esp), %eax +; STATIC-NEXT: testl %eax, %eax +; STATIC-NEXT: jle .LBB0_3 +; STATIC-NEXT: # %bb.1: # %cond_true.preheader +; STATIC-NEXT: movl {{[0-9]+}}(%esp), %ecx +; STATIC-NEXT: shll $6, %ecx +; STATIC-NEXT: xorl %edx, %edx +; STATIC-NEXT: .p2align 4 +; STATIC-NEXT: .LBB0_2: # %cond_true +; STATIC-NEXT: # =>This Inner Loop Header: Depth=1 +; STATIC-NEXT: movl $4, A+4(%ecx,%edx,4) +; STATIC-NEXT: movl $5, A+8(%ecx,%edx,4) +; STATIC-NEXT: incl %edx +; STATIC-NEXT: cmpl %edx, %eax +; STATIC-NEXT: jne .LBB0_2 +; STATIC-NEXT: .LBB0_3: # %return +; STATIC-NEXT: retl entry: %N = bitcast i32 %N.in to i32 ; [#uses=1] %tmp5 = icmp sgt i32 %N.in, 0 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll index 558a4e9fb0864..5da5e9a63ab80 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll @@ -1,15 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s -; CHECK: align -; CHECK: movl $4, -4(%ecx) -; CHECK: movl $5, (%ecx) -; CHECK: addl $4, %ecx -; CHECK: decl %eax -; CHECK: jne - @A = global [16 x [16 x i32]] zeroinitializer, align 32 ; [#uses=2] define void @test(i32 %row, i32 %N.in) nounwind { +; CHECK-LABEL: test: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: jle LBB0_3 +; CHECK-NEXT: ## %bb.1: ## %cond_true.preheader +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: shll $6, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: LBB0_2: ## %cond_true +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movl $4, _A+4(%ecx,%edx,4) +; CHECK-NEXT: movl $5, _A+8(%ecx,%edx,4) +; CHECK-NEXT: incl %edx +; CHECK-NEXT: cmpl %edx, %eax +; CHECK-NEXT: jne LBB0_2 +; CHECK-NEXT: LBB0_3: ## %return +; CHECK-NEXT: retl entry: %N = bitcast i32 %N.in to i32 ; [#uses=1] %tmp5 = icmp sgt i32 %N.in, 0 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce.ll b/llvm/test/CodeGen/X86/loop-strength-reduce.ll index 
a8c28b7d16dce..3305ce07b3565 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce.ll @@ -1,15 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s -; CHECK: align -; CHECK: movl $4, -4(%ecx) -; CHECK: movl $5, (%ecx) -; CHECK: addl $4, %ecx -; CHECK: decl %eax -; CHECK: jne - @A = internal global [16 x [16 x i32]] zeroinitializer, align 32 ; [#uses=2] define void @test(i32 %row, i32 %N.in) nounwind { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: jle .LBB0_3 +; CHECK-NEXT: # %bb.1: # %cond_true.preheader +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: shll $6, %ecx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %cond_true +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movl $4, A+4(%ecx,%edx,4) +; CHECK-NEXT: movl $5, A+8(%ecx,%edx,4) +; CHECK-NEXT: incl %edx +; CHECK-NEXT: cmpl %edx, %eax +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: .LBB0_3: # %return +; CHECK-NEXT: retl entry: %N = bitcast i32 %N.in to i32 ; [#uses=1] %tmp5 = icmp sgt i32 %N.in, 0 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce4.ll b/llvm/test/CodeGen/X86/loop-strength-reduce4.ll index 4bb1150bf702a..cf2d3759d29ef 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce4.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce4.ll @@ -1,27 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i686-apple-darwin -relocation-model=static | FileCheck %s -check-prefix=STATIC ; RUN: llc < %s -mtriple=i686-apple-darwin -relocation-model=pic | FileCheck %s -check-prefix=PIC ; By starting the IV at -64 instead of 0, a cmp is eliminated, ; as the flags from the add can be used directly. -; STATIC: movl $-64, [[EAX:%e..]] - -; STATIC: movl %{{.+}}, _state+76([[EAX]]) -; STATIC: addl $16, [[EAX]] -; STATIC: jne - -; The same for PIC mode. 
- -; PIC: movl $-64, [[EAX:%e..]] - -; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]]) -; PIC: addl $16, [[EAX]] -; PIC: jne - @state = external global [0 x i32] ; [#uses=4] @S = external global [0 x i32] ; [#uses=4] define i32 @foo() nounwind { +; STATIC-LABEL: foo: +; STATIC: ## %bb.0: ## %entry +; STATIC-NEXT: xorl %ecx, %ecx +; STATIC-NEXT: movl $-64, %eax +; STATIC-NEXT: .p2align 4 +; STATIC-NEXT: LBB0_1: ## %bb +; STATIC-NEXT: ## =>This Inner Loop Header: Depth=1 +; STATIC-NEXT: movl _S(,%ecx,4), %ecx +; STATIC-NEXT: xorl _state+64(%eax), %ecx +; STATIC-NEXT: movl %ecx, _state+64(%eax) +; STATIC-NEXT: movl _S(,%ecx,4), %ecx +; STATIC-NEXT: xorl _state+68(%eax), %ecx +; STATIC-NEXT: movl %ecx, _state+68(%eax) +; STATIC-NEXT: movl _S(,%ecx,4), %ecx +; STATIC-NEXT: xorl _state+72(%eax), %ecx +; STATIC-NEXT: movl %ecx, _state+72(%eax) +; STATIC-NEXT: movl _S(,%ecx,4), %ecx +; STATIC-NEXT: xorl _state+76(%eax), %ecx +; STATIC-NEXT: movl %ecx, _state+76(%eax) +; STATIC-NEXT: addl $16, %eax +; STATIC-NEXT: jne LBB0_1 +; STATIC-NEXT: ## %bb.2: ## %bb57 +; STATIC-NEXT: movzbl %cl, %eax +; STATIC-NEXT: retl +; +; PIC-LABEL: foo: +; PIC: ## %bb.0: ## %entry +; PIC-NEXT: pushl %ebx +; PIC-NEXT: pushl %esi +; PIC-NEXT: calll L0$pb +; PIC-NEXT: L0$pb: +; PIC-NEXT: popl %edx +; PIC-NEXT: xorl %eax, %eax +; PIC-NEXT: movl L_state$non_lazy_ptr-L0$pb(%edx), %ecx +; PIC-NEXT: movl L_S$non_lazy_ptr-L0$pb(%edx), %edx +; PIC-NEXT: xorl %ebx, %ebx +; PIC-NEXT: .p2align 4 +; PIC-NEXT: LBB0_1: ## %bb +; PIC-NEXT: ## =>This Inner Loop Header: Depth=1 +; PIC-NEXT: movl (%edx,%ebx,4), %esi +; PIC-NEXT: xorl (%ecx,%eax), %esi +; PIC-NEXT: movl %esi, (%ecx,%eax) +; PIC-NEXT: movl (%edx,%esi,4), %esi +; PIC-NEXT: xorl 4(%ecx,%eax), %esi +; PIC-NEXT: movl %esi, 4(%ecx,%eax) +; PIC-NEXT: movl (%edx,%esi,4), %esi +; PIC-NEXT: xorl 8(%ecx,%eax), %esi +; PIC-NEXT: movl %esi, 8(%ecx,%eax) +; PIC-NEXT: movl (%edx,%esi,4), %ebx +; PIC-NEXT: xorl 12(%ecx,%eax), %ebx +; PIC-NEXT: movl %ebx, 12(%ecx,%eax) +; PIC-NEXT: addl $16, %eax +; PIC-NEXT: cmpl $64, %eax +; PIC-NEXT: jne LBB0_1 +; PIC-NEXT: ## %bb.2: ## %bb57 +; PIC-NEXT: movzbl %bl, %eax +; PIC-NEXT: popl %esi +; PIC-NEXT: popl %ebx +; PIC-NEXT: retl entry: br label %bb diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce8.ll b/llvm/test/CodeGen/X86/loop-strength-reduce8.ll index 9b76034916824..a001ad70ab469 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce8.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce8.ll @@ -1,14 +1,6 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -; FIXME: The first two instructions, movl and addl, should have been combined to -; "leal 16(%eax), %edx" by the backend (PR20776). -; CHECK: movl %eax, %edx -; CHECK: addl $16, %edx -; CHECK: align -; CHECK: addl $4, %edx -; CHECK: decl %ecx -; CHECK: jne LBB0_2 - %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 } %struct.bitmap_element = type { ptr, ptr, i32, [2 x i64] } %struct.bitmap_head_def = type { ptr, ptr, i32 } @@ -48,6 +40,39 @@ @llvm.used = appending global [1 x ptr] [ ptr @build_stmt ], section "llvm.metadata" ; [#uses=0] define ptr @build_stmt(i32 %code, ...) 
nounwind { +; CHECK-LABEL: build_stmt: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: calll _make_node +; CHECK-NEXT: movl _tree_code_length(,%esi,4), %ecx +; CHECK-NEXT: movl _lineno, %edx +; CHECK-NEXT: movl %edx, 12(%eax) +; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: jle LBB0_3 +; CHECK-NEXT: ## %bb.1: ## %bb.preheader +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: LBB0_2: ## %bb +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movl (%esp), %esi +; CHECK-NEXT: leal 4(%esi), %edi +; CHECK-NEXT: movl %edi, (%esp) +; CHECK-NEXT: movl (%esi), %esi +; CHECK-NEXT: movl %esi, 16(%eax,%edx,4) +; CHECK-NEXT: incl %edx +; CHECK-NEXT: cmpl %edx, %ecx +; CHECK-NEXT: jne LBB0_2 +; CHECK-NEXT: LBB0_3: ## %bb3 +; CHECK-NEXT: addl $4, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: retl entry: %p = alloca ptr ; [#uses=3] call void @llvm.va_start(ptr %p) diff --git a/llvm/test/CodeGen/X86/lrshrink-debug.ll b/llvm/test/CodeGen/X86/lrshrink-debug.ll index dd52968529902..1a70acd943c4e 100755 --- a/llvm/test/CodeGen/X86/lrshrink-debug.ll +++ b/llvm/test/CodeGen/X86/lrshrink-debug.ll @@ -38,8 +38,8 @@ define noundef i32 @test(i1 %tobool1.not, i32 %sh.012, i1 %cmp, i64 %sh_prom, i6 ; CHECK-NEXT: # %bb.2: # %if.end ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: testb $1, %dl ; CHECK-NEXT: movl $0, %ebp +; CHECK-NEXT: testb $1, %dl ; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: # %bb.3: # %if.end ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 diff --git a/llvm/test/CodeGen/X86/lsr-i386.ll b/llvm/test/CodeGen/X86/lsr-i386.ll index 443ec3f32dd86..44b9c26c80264 100644 --- a/llvm/test/CodeGen/X86/lsr-i386.ll +++ b/llvm/test/CodeGen/X86/lsr-i386.ll @@ -1,21 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-pc-linux-gnu" ; PR7651 -; CHECK: align -; CHECK: align -; CHECK: align -; CHECK: movl $0, (%e -; CHECK-NEXT: addl $4, %e -; CHECK-NEXT: decl %e -; CHECK-NEXT: jne - %struct.anon = type { [72 x i32], i32 } @mp2grad_ = external dso_local global %struct.anon define void @chomp2g_setup_(i32 %n, i32 %m) nounwind { +; CHECK-LABEL: chomp2g_setup_: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: shll $5, %ecx +; CHECK-NEXT: movl $mp2grad_, %edx +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %bb1 +; CHECK-NEXT: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB0_2 Depth 2 +; CHECK-NEXT: movl mp2grad_+288, %esi +; CHECK-NEXT: imull %ecx, %esi +; CHECK-NEXT: addl %edx, %esi +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %bb2 +; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: movl $0, (%esi,%edi,4) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %edi, %eax +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.3: # %bb6 +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: addl $32, %edx +; CHECK-NEXT: jmp 
.LBB0_1 entry: br label %bb1 diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll index 4cd206adc31de..286384f8a239d 100644 --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -9,56 +9,53 @@ define void @t(ptr nocapture %in, ptr nocapture %out, ptr nocapture %rk, i32 %r) nounwind { ; GENERIC-LABEL: t: ; GENERIC: ## %bb.0: ## %entry -; GENERIC-NEXT: pushq %rbp -; GENERIC-NEXT: pushq %r15 ; GENERIC-NEXT: pushq %r14 ; GENERIC-NEXT: pushq %rbx ; GENERIC-NEXT: ## kill: def $ecx killed $ecx def $rcx ; GENERIC-NEXT: movl (%rdx), %r8d -; GENERIC-NEXT: movl 4(%rdx), %ebx +; GENERIC-NEXT: movl 4(%rdx), %r11d ; GENERIC-NEXT: decl %ecx -; GENERIC-NEXT: leaq 20(%rdx), %r9 +; GENERIC-NEXT: shlq $4, %rcx +; GENERIC-NEXT: xorl %r9d, %r9d ; GENERIC-NEXT: movq _Te0@GOTPCREL(%rip), %rdi ; GENERIC-NEXT: movq _Te1@GOTPCREL(%rip), %rax ; GENERIC-NEXT: movq _Te3@GOTPCREL(%rip), %r10 -; GENERIC-NEXT: movq %rcx, %r11 ; GENERIC-NEXT: .p2align 4 ; GENERIC-NEXT: LBB0_1: ## %bb ; GENERIC-NEXT: ## =>This Inner Loop Header: Depth=1 ; GENERIC-NEXT: movzbl %r8b, %r14d ; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8 ; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl %ebx, %r15d -; GENERIC-NEXT: shrl $14, %r15d -; GENERIC-NEXT: andl $1020, %r15d ## imm = 0x3FC -; GENERIC-NEXT: movl (%rax,%r15), %ebp -; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp -; GENERIC-NEXT: xorl -12(%r9), %ebp -; GENERIC-NEXT: shrl $24, %ebx +; GENERIC-NEXT: movl %r11d, %ebx +; GENERIC-NEXT: shrl $14, %ebx +; GENERIC-NEXT: andl $1020, %ebx ## imm = 0x3FC +; GENERIC-NEXT: movl (%rax,%rbx), %ebx +; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebx +; GENERIC-NEXT: xorl 8(%rdx,%r9), %ebx +; GENERIC-NEXT: shrl $24, %r11d ; GENERIC-NEXT: movl (%r10,%r14,4), %r14d -; GENERIC-NEXT: xorl (%rdi,%rbx,4), %r14d -; GENERIC-NEXT: xorl -8(%r9), %r14d -; GENERIC-NEXT: movl %ebp, %r8d +; GENERIC-NEXT: xorl (%rdi,%r11,4), %r14d +; GENERIC-NEXT: xorl 12(%rdx,%r9), %r14d +; GENERIC-NEXT: movl %ebx, %r8d ; GENERIC-NEXT: shrl $24, %r8d ; GENERIC-NEXT: movl (%rdi,%r8,4), %r8d -; GENERIC-NEXT: subq $1, %r11 -; GENERIC-NEXT: jb LBB0_3 +; GENERIC-NEXT: cmpq %r9, %rcx +; GENERIC-NEXT: je LBB0_3 ; GENERIC-NEXT: ## %bb.2: ## %bb1 ; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; GENERIC-NEXT: movl %r14d, %ebx -; GENERIC-NEXT: shrl $14, %ebx -; GENERIC-NEXT: andl $1020, %ebx ## imm = 0x3FC -; GENERIC-NEXT: xorl (%rax,%rbx), %r8d -; GENERIC-NEXT: xorl -4(%r9), %r8d +; GENERIC-NEXT: movl %r14d, %r11d +; GENERIC-NEXT: shrl $14, %r11d +; GENERIC-NEXT: andl $1020, %r11d ## imm = 0x3FC +; GENERIC-NEXT: xorl (%rax,%r11), %r8d +; GENERIC-NEXT: xorl 16(%rdx,%r9), %r8d ; GENERIC-NEXT: shrl $24, %r14d -; GENERIC-NEXT: movzbl %bpl, %ebx -; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx -; GENERIC-NEXT: xorl (%rdi,%r14,4), %ebx -; GENERIC-NEXT: xorl (%r9), %ebx +; GENERIC-NEXT: movzbl %bl, %r11d +; GENERIC-NEXT: movl (%r10,%r11,4), %r11d +; GENERIC-NEXT: xorl (%rdi,%r14,4), %r11d +; GENERIC-NEXT: xorl 20(%rdx,%r9), %r11d ; GENERIC-NEXT: addq $16, %r9 ; GENERIC-NEXT: jmp LBB0_1 ; GENERIC-NEXT: LBB0_3: ## %bb2 -; GENERIC-NEXT: shlq $4, %rcx ; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 ; GENERIC-NEXT: movl %r14d, %r9d ; GENERIC-NEXT: shrl $14, %r9d @@ -70,7 +67,7 @@ define void @t(ptr nocapture %in, ptr nocapture %out, ptr nocapture %rk, i32 %r) ; GENERIC-NEXT: shrl $8, %r14d ; GENERIC-NEXT: movzbl 3(%rdi,%r14,4), %edi ; GENERIC-NEXT: shll $24, %edi -; GENERIC-NEXT: movzbl %bpl, %r8d +; GENERIC-NEXT: 
movzbl %bl, %r8d ; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %eax ; GENERIC-NEXT: shll $16, %eax ; GENERIC-NEXT: orl %edi, %eax @@ -87,8 +84,6 @@ define void @t(ptr nocapture %in, ptr nocapture %out, ptr nocapture %rk, i32 %r) ; GENERIC-NEXT: movb %al, 5(%rsi) ; GENERIC-NEXT: popq %rbx ; GENERIC-NEXT: popq %r14 -; GENERIC-NEXT: popq %r15 -; GENERIC-NEXT: popq %rbp ; GENERIC-NEXT: retq ; ; ATOM-LABEL: t: diff --git a/llvm/test/CodeGen/X86/lsr-negative-stride.ll b/llvm/test/CodeGen/X86/lsr-negative-stride.ll index 1d5e208f3a326..3121a71b5c720 100644 --- a/llvm/test/CodeGen/X86/lsr-negative-stride.ll +++ b/llvm/test/CodeGen/X86/lsr-negative-stride.ll @@ -36,16 +36,16 @@ define i32 @t(i32 %a, i32 %b) nounwind { ; CHECK-NEXT: jle .LBB0_5 ; CHECK-NEXT: # %bb.4: # %cond_true ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=2 -; CHECK-NEXT: cmpl %eax, %ecx ; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: cmpl %eax, %ecx ; CHECK-NEXT: jne .LBB0_3 ; CHECK-NEXT: jmp .LBB0_6 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_5: # %cond_false ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: subl %edx, %ecx -; CHECK-NEXT: cmpl %edx, %ecx ; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: cmpl %edx, %ecx ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: .LBB0_6: # %bb17 ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll index c84a1159ad56a..0138bd0df4c7c 100644 --- a/llvm/test/CodeGen/X86/machine-cp.ll +++ b/llvm/test/CodeGen/X86/machine-cp.ll @@ -17,8 +17,8 @@ define i32 @t1(i32 %a, i32 %b) nounwind { ; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx -; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jne LBB0_2 ; CHECK-NEXT: ## %bb.3: ## %while.end ; CHECK-NEXT: movl %ecx, %eax @@ -59,21 +59,34 @@ define i32 @t3(i64 %a, i64 %b) nounwind { ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: je LBB2_4 -; CHECK-NEXT: ## %bb.1: ## %while.body.preheader +; CHECK-NEXT: je LBB2_7 +; CHECK-NEXT: ## %bb.1: ; CHECK-NEXT: movq %rsi, %rdx +; CHECK-NEXT: jmp LBB2_4 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: LBB2_2: ## %while.body -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: LBB2_2: ## in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: cqto ; CHECK-NEXT: idivq %rcx -; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: LBB2_3: ## in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: je LBB2_6 +; CHECK-NEXT: LBB2_4: ## %while.body +; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: shrq $32, %rdx ; CHECK-NEXT: jne LBB2_2 -; CHECK-NEXT: ## %bb.3: ## %while.end +; CHECK-NEXT: ## %bb.5: ## in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: divl %ecx +; CHECK-NEXT: ## kill: def $edx killed $edx def $rdx +; CHECK-NEXT: jmp LBB2_3 +; CHECK-NEXT: LBB2_6: ## %while.end ; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: LBB2_4: +; CHECK-NEXT: LBB2_7: ; CHECK-NEXT: retq entry: %cmp1 = icmp eq i64 %b, 0 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index bdb7c307a5759..bd95a69c1bd54 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -454,9 +454,9 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado ; AVX2-NEXT: 
vmovdqu (%rsi,%rcx,2), %ymm3 ; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 ; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 +; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB3_1 @@ -817,9 +817,9 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly ; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 ; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm5 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 @@ -1008,9 +1008,9 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl ; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm8 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 @@ -1297,9 +1297,9 @@ define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readon ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 @@ -1439,9 +1439,9 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmulld %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 @@ -1658,17 +1658,17 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm5, %xmm12, %xmm5 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm6, %xmm12, %xmm6 -; 
AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm7, %xmm12, %xmm7 +; AVX1-NEXT: vpmulld %xmm7, %xmm13, %xmm7 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm8, %xmm12, %xmm8 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm9, %xmm12, %xmm9 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11 +; AVX1-NEXT: vpmulld %xmm11, %xmm13, %xmm11 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 ; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 @@ -2194,7 +2194,6 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { ; SSE2-LABEL: jumbled_indices32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 @@ -2202,6 +2201,7 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi) ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) @@ -2363,10 +2363,10 @@ define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) { ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm3 ; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddwd_512: @@ -2410,7 +2410,6 @@ define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) { define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) { ; SSE2-LABEL: pmaddwd_1024: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 @@ -2423,6 +2422,7 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) { ; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 ; SSE2-NEXT: movdqa 80(%rsi), %xmm5 ; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movdqa 96(%rsi), %xmm6 ; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 ; SSE2-NEXT: movdqa 112(%rsi), %xmm7 @@ -2445,15 +2445,15 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) { ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm3 ; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3 -; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4 @@ -2481,10 +2481,10 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) { ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1 +; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: pmaddwd_1024: @@ -2679,18 +2679,18 @@ define i32 @madd_quad_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr % ; ; AVX-LABEL: madd_quad_reduction: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqu (%rdi), %xmm0 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: vmovdqu (%r8), %xmm2 +; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqu (%r8), %xmm1 -; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%r10), %xmm1 +; AVX-NEXT: vmovdqu (%rcx), %xmm1 ; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2739,6 +2739,7 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %esi, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -2755,8 +2756,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: pmaddwd %xmm4, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: addq $8, %rdi -; SSE2-NEXT: addq $-8, %rax +; SSE2-NEXT: addq $8, %rcx +; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB33_1 ; SSE2-NEXT: # %bb.2: # %middle.block ; SSE2-NEXT: paddd %xmm3, %xmm2 @@ -2779,6 +2780,7 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %esi, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB33_1: # %vector.body @@ -2795,8 +2797,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: addq $8, %rdi -; AVX1-NEXT: addq $-8, %rax +; AVX1-NEXT: addq $8, %rcx +; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB33_1 ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -2822,6 +2824,7 @@ define i64 
@sum_and_sum_of_squares(ptr %a, i32 %n) { ; AVX256: # %bb.0: # %entry ; AVX256-NEXT: movl %esi, %eax ; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX256-NEXT: xorl %ecx, %ecx ; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX256-NEXT: .p2align 4 ; AVX256-NEXT: .LBB33_1: # %vector.body @@ -2830,8 +2833,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) { ; AVX256-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX256-NEXT: vpmaddwd %ymm2, %ymm2, %ymm2 ; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX256-NEXT: addq $8, %rdi -; AVX256-NEXT: addq $-8, %rax +; AVX256-NEXT: addq $8, %rcx +; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB33_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -3149,9 +3152,9 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i ; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2 ; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm5 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked-iv-safe.ll b/llvm/test/CodeGen/X86/masked-iv-safe.ll index a4f5e52a27d8a..daa316f220a88 100644 --- a/llvm/test/CodeGen/X86/masked-iv-safe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll @@ -7,7 +7,7 @@ define void @count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_up: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-80, %rax +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -18,8 +18,9 @@ define void @count_up(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rax) -; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq $10, %rax ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -99,7 +100,7 @@ return: define void @count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_up_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-80, %rax +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -110,8 +111,9 @@ define void @count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rax) -; CHECK-NEXT: addq $8, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq $10, %rax ; CHECK-NEXT: jne .LBB2_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -245,7 +247,7 @@ return: define void @another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-2040, %rax # imm = 0xF808 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -256,16 +258,17 @@ define void 
@another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 -; CHECK-NEXT: movsd %xmm3, 2040(%rdi,%rax) +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: divsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rcx) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdx) +; CHECK-NEXT: addq $255, %rax ; CHECK-NEXT: addq $-8, %rdx ; CHECK-NEXT: addq $134217720, %rcx # imm = 0x7FFFFF8 -; CHECK-NEXT: addq $2040, %rax # imm = 0x7F8 +; CHECK-NEXT: cmpq $255, %rax ; CHECK-NEXT: jne .LBB5_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -347,7 +350,7 @@ return: define void @another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl $8, %eax +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -358,9 +361,9 @@ define void @another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: divsd %xmm1, %xmm3 ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, -8(%rdi,%rax) -; CHECK-NEXT: addq $-8, %rax -; CHECK-NEXT: jne .LBB7_1 +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) +; CHECK-NEXT: addq $-1, %rax +; CHECK-NEXT: jb .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll index 42bd6e9b75447..ce49dbde3b173 100644 --- a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll @@ -174,31 +174,32 @@ return: define void @count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-10, %rax -; CHECK-NEXT: movl $167772160, %ecx # imm = 0xA000000 -; CHECK-NEXT: movl $2560, %edx # imm = 0xA00 +; CHECK-NEXT: movl $167772160, %eax # imm = 0xA000000 +; CHECK-NEXT: movl $2560, %ecx # imm = 0xA00 +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: movq %rcx, %rsi ; CHECK-NEXT: sarq $8, %rsi ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rsi,8) -; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: movq %rax, %rsi ; CHECK-NEXT: sarq $24, %rsi ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rsi,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 160(%rdi,%rax,8) -; CHECK-NEXT: addq $-16777216, %rcx # imm = 0xFF000000 -; CHECK-NEXT: addq $-256, %rdx -; CHECK-NEXT: decq %rax +; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rdx,8) +; CHECK-NEXT: decq %rdx +; CHECK-NEXT: addq $-16777216, %rax # imm = 0xFF000000 +; CHECK-NEXT: addq $-256, %rcx +; CHECK-NEXT: cmpq $10, %rdx ; CHECK-NEXT: jne .LBB3_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -346,7 +347,7 @@ define void @another_count_up_signed(ptr %d, i64 
%n) nounwind { ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movq %rdi, %rdx +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB6_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -362,11 +363,11 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: movsd %xmm3, (%rdi,%r8,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, (%rdx) -; CHECK-NEXT: addq $8, %rdx +; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) +; CHECK-NEXT: incq %rdx ; CHECK-NEXT: addq $16777216, %rcx # imm = 0x1000000 ; CHECK-NEXT: addq $256, %rax # imm = 0x100 -; CHECK-NEXT: decq %rsi +; CHECK-NEXT: cmpq %rdx, %rsi ; CHECK-NEXT: jne .LBB6_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -402,32 +403,33 @@ return: define void @another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: leaq -10(%rsi), %rax +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: shlq $24, %rax ; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: shlq $24, %rcx -; CHECK-NEXT: shlq $8, %rsi +; CHECK-NEXT: shlq $8, %rcx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rsi, %rdx +; CHECK-NEXT: movq %rcx, %rdx ; CHECK-NEXT: sarq $8, %rdx ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) -; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: sarq $24, %rdx ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rax,8) -; CHECK-NEXT: addq $-16777216, %rcx # imm = 0xFF000000 -; CHECK-NEXT: addq $-256, %rsi -; CHECK-NEXT: decq %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rsi,8) +; CHECK-NEXT: decq %rsi +; CHECK-NEXT: addq $-16777216, %rax # imm = 0xFF000000 +; CHECK-NEXT: addq $-256, %rcx +; CHECK-NEXT: cmpq $10, %rsi ; CHECK-NEXT: jne .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -463,7 +465,7 @@ return: define void @yet_another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: yet_another_count_down: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-2040, %rax # imm = 0xF808 +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] @@ -474,16 +476,17 @@ define void @yet_another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 -; CHECK-NEXT: movsd %xmm3, 2040(%rdi,%rax) +; CHECK-NEXT: movsd %xmm3, (%rdi,%rax,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rcx) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdx) +; CHECK-NEXT: addq $255, %rax ; CHECK-NEXT: addq $-8, %rdx ; CHECK-NEXT: addq $134217720, %rcx # imm 
= 0x7FFFFF8 -; CHECK-NEXT: addq $2040, %rax # imm = 0x7F8 +; CHECK-NEXT: cmpq $255, %rax ; CHECK-NEXT: jne .LBB8_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq @@ -624,31 +627,32 @@ return: define void @yet_another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: yet_another_count_up_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $-10, %rax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB11_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: movq %rax, %rsi ; CHECK-NEXT: sarq $8, %rsi ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rsi,8) -; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: movq %rcx, %rsi ; CHECK-NEXT: sarq $24, %rsi ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rsi,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rax,8) -; CHECK-NEXT: addq $50331648, %rdx # imm = 0x3000000 -; CHECK-NEXT: addq $768, %rcx # imm = 0x300 -; CHECK-NEXT: addq $3, %rax +; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) +; CHECK-NEXT: addq $3, %rdx +; CHECK-NEXT: addq $50331648, %rcx # imm = 0x3000000 +; CHECK-NEXT: addq $768, %rax # imm = 0x300 +; CHECK-NEXT: cmpq $10, %rdx ; CHECK-NEXT: jne .LBB11_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll index 3187bf6448690..3f25413de665f 100644 --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -98,8 +98,8 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask ; AVX1-NEXT: vmovhpd %xmm0, (%rdi) ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB0_4: ## %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne LBB0_5 ; AVX1-NEXT: ## %bb.6: ## %else5 ; AVX1-NEXT: testb $8, %al @@ -114,8 +114,8 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask ; AVX1-NEXT: vmovhps %xmm1, (%rdi) ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB0_12: ## %else14 -; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: jne LBB0_13 ; AVX1-NEXT: ## %bb.14: ## %else17 ; AVX1-NEXT: testb $-128, %al @@ -166,8 +166,8 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask ; AVX2-NEXT: vmovhpd %xmm0, (%rdi) ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB0_4: ## %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne LBB0_5 ; AVX2-NEXT: ## %bb.6: ## %else5 ; AVX2-NEXT: testb $8, %al @@ -182,8 +182,8 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask ; AVX2-NEXT: vmovhps %xmm1, (%rdi) ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB0_12: ## %else14 -; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: jne LBB0_13 ; AVX2-NEXT: ## %bb.14: ## %else17 ; AVX2-NEXT: testb $-128, %al @@ -396,8 +396,8 @@ define void 
@compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> % ; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_4: ## %else2 -; AVX1OR2-NEXT: testb $4, %al ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1OR2-NEXT: testb $4, %al ; AVX1OR2-NEXT: jne LBB1_5 ; AVX1OR2-NEXT: ## %bb.6: ## %else5 ; AVX1OR2-NEXT: testb $8, %al @@ -412,8 +412,8 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> % ; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_12: ## %else14 -; AVX1OR2-NEXT: testb $64, %al ; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1OR2-NEXT: testb $64, %al ; AVX1OR2-NEXT: jne LBB1_13 ; AVX1OR2-NEXT: ## %bb.14: ## %else17 ; AVX1OR2-NEXT: testb %al, %al @@ -428,8 +428,8 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> % ; AVX1OR2-NEXT: vmovhps %xmm2, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_20: ## %else26 -; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1OR2-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1OR2-NEXT: jne LBB1_21 ; AVX1OR2-NEXT: ## %bb.22: ## %else29 ; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800 @@ -444,8 +444,8 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> % ; AVX1OR2-NEXT: vmovhps %xmm3, (%rdi) ; AVX1OR2-NEXT: addq $8, %rdi ; AVX1OR2-NEXT: LBB1_28: ## %else38 -; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1OR2-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1OR2-NEXT: jne LBB1_29 ; AVX1OR2-NEXT: ## %bb.30: ## %else41 ; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000 @@ -980,8 +980,8 @@ define void @compressstore_v8f32_v8i1(ptr %base, <8 x float> %V, <8 x i1> %mask) ; AVX1-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB4_8: ## %else8 -; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: jne LBB4_9 ; AVX1-NEXT: ## %bb.10: ## %else11 ; AVX1-NEXT: testb $32, %al @@ -1051,8 +1051,8 @@ define void @compressstore_v8f32_v8i1(ptr %base, <8 x float> %V, <8 x i1> %mask) ; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB4_8: ## %else8 -; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: jne LBB4_9 ; AVX2-NEXT: ## %bb.10: ## %else11 ; AVX2-NEXT: testb $32, %al @@ -1135,32 +1135,13 @@ define void @compressstore_v8f32_v8i1(ptr %base, <8 x float> %V, <8 x i1> %mask) define void @compressstore_v16f32_const(ptr %base, <16 x float> %V) { ; SSE2-LABEL: compressstore_v16f32_const: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss %xmm0, (%rdi) -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] -; SSE2-NEXT: movss %xmm4, 4(%rdi) -; SSE2-NEXT: movaps %xmm0, %xmm4 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE2-NEXT: movss %xmm4, 8(%rdi) -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: movss %xmm0, 12(%rdi) -; SSE2-NEXT: movss %xmm1, 16(%rdi) -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE2-NEXT: movss %xmm0, 20(%rdi) -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: movss %xmm0, 24(%rdi) -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE2-NEXT: movss %xmm1, 28(%rdi) -; SSE2-NEXT: movss %xmm2, 32(%rdi) -; SSE2-NEXT: movaps %xmm2, %xmm0 -; 
SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE2-NEXT: movss %xmm0, 36(%rdi) -; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: movss %xmm2, 40(%rdi) -; SSE2-NEXT: movss %xmm3, 44(%rdi) -; SSE2-NEXT: movaps %xmm3, %xmm0 +; SSE2-NEXT: movups %xmm0, (%rdi) +; SSE2-NEXT: movups %xmm1, 16(%rdi) +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE2-NEXT: movups %xmm2, 32(%rdi) +; SSE2-NEXT: movapd %xmm3, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE2-NEXT: movss %xmm0, 48(%rdi) ; SSE2-NEXT: movaps %xmm3, %xmm0 @@ -1882,8 +1863,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX1-NEXT: vextractps $3, %xmm0, (%rdi) ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB6_8: ## %else8 -; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: jne LBB6_9 ; AVX1-NEXT: ## %bb.10: ## %else11 ; AVX1-NEXT: testb $32, %al @@ -1910,8 +1891,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX1-NEXT: vextractps $3, %xmm1, (%rdi) ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB6_24: ## %else32 -; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: jne LBB6_25 ; AVX1-NEXT: ## %bb.26: ## %else35 ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 @@ -1938,8 +1919,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX1-NEXT: vextractps $3, %xmm2, (%rdi) ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB6_40: ## %else56 -; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX1-NEXT: jne LBB6_41 ; AVX1-NEXT: ## %bb.42: ## %else59 ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 @@ -1966,8 +1947,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX1-NEXT: vextractps $3, %xmm3, (%rdi) ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: LBB6_56: ## %else80 -; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX1-NEXT: jne LBB6_57 ; AVX1-NEXT: ## %bb.58: ## %else83 ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 @@ -2154,8 +2135,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_8: ## %else8 -; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: jne LBB6_9 ; AVX2-NEXT: ## %bb.10: ## %else11 ; AVX2-NEXT: testb $32, %al @@ -2182,8 +2163,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX2-NEXT: vextractps $3, %xmm1, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_24: ## %else32 -; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: jne LBB6_25 ; AVX2-NEXT: ## %bb.26: ## %else35 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 @@ -2210,8 +2191,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX2-NEXT: vextractps $3, %xmm2, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_40: ## %else56 -; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: 
vextractf128 $1, %ymm2, %xmm0 +; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: jne LBB6_41 ; AVX2-NEXT: ## %bb.42: ## %else59 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 @@ -2238,8 +2219,8 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32> ; AVX2-NEXT: vextractps $3, %xmm3, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_56: ## %else80 -; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: jne LBB6_57 ; AVX2-NEXT: ## %bb.58: ## %else83 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 @@ -2611,8 +2592,8 @@ define void @compressstore_v4i64_v4i1(ptr %base, <4 x i64> %V, <4 x i1> %mask) { ; AVX1-NEXT: vpextrq $1, %xmm0, (%rdi) ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB8_4: ## %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne LBB8_5 ; AVX1-NEXT: ## %bb.6: ## %else5 ; AVX1-NEXT: testb $8, %al @@ -2646,8 +2627,8 @@ define void @compressstore_v4i64_v4i1(ptr %base, <4 x i64> %V, <4 x i1> %mask) { ; AVX2-NEXT: vpextrq $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB8_4: ## %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne LBB8_5 ; AVX2-NEXT: ## %bb.6: ## %else5 ; AVX2-NEXT: testb $8, %al @@ -2855,8 +2836,8 @@ define void @compressstore_v8i64_v8i1(ptr %base, <8 x i64> %V, <8 x i1> %mask) { ; AVX1-NEXT: vpextrq $1, %xmm0, (%rdi) ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB9_4: ## %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne LBB9_5 ; AVX1-NEXT: ## %bb.6: ## %else5 ; AVX1-NEXT: testb $8, %al @@ -2871,8 +2852,8 @@ define void @compressstore_v8i64_v8i1(ptr %base, <8 x i64> %V, <8 x i1> %mask) { ; AVX1-NEXT: vpextrq $1, %xmm1, (%rdi) ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: LBB9_12: ## %else14 -; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: jne LBB9_13 ; AVX1-NEXT: ## %bb.14: ## %else17 ; AVX1-NEXT: testb $-128, %al @@ -2923,8 +2904,8 @@ define void @compressstore_v8i64_v8i1(ptr %base, <8 x i64> %V, <8 x i1> %mask) { ; AVX2-NEXT: vpextrq $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB9_4: ## %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne LBB9_5 ; AVX2-NEXT: ## %bb.6: ## %else5 ; AVX2-NEXT: testb $8, %al @@ -2939,8 +2920,8 @@ define void @compressstore_v8i64_v8i1(ptr %base, <8 x i64> %V, <8 x i1> %mask) { ; AVX2-NEXT: vpextrq $1, %xmm1, (%rdi) ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: LBB9_12: ## %else14 -; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: jne LBB9_13 ; AVX2-NEXT: ## %bb.14: ## %else17 ; AVX2-NEXT: testb $-128, %al @@ -3594,8 +3575,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqb %xmm1, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB12_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -3611,8 +3592,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %cl, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_8: ## %else8 -; 
SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB12_10 ; SSE2-NEXT: ## %bb.9: ## %cond.store10 ; SSE2-NEXT: movb %cl, (%rdi) @@ -3624,8 +3605,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %ch, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_12: ## %else14 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB12_14 ; SSE2-NEXT: ## %bb.13: ## %cond.store16 ; SSE2-NEXT: movb %cl, (%rdi) @@ -3637,8 +3618,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %ch, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_16: ## %else20 -; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: je LBB12_18 ; SSE2-NEXT: ## %bb.17: ## %cond.store22 ; SSE2-NEXT: movb %cl, (%rdi) @@ -3650,8 +3631,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %ch, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_20: ## %else26 -; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: je LBB12_22 ; SSE2-NEXT: ## %bb.21: ## %cond.store28 ; SSE2-NEXT: movb %cl, (%rdi) @@ -3663,8 +3644,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %ch, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_24: ## %else32 -; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: je LBB12_26 ; SSE2-NEXT: ## %bb.25: ## %cond.store34 ; SSE2-NEXT: movb %cl, (%rdi) @@ -3676,8 +3657,8 @@ define void @compressstore_v16i8_v16i8(ptr %base, <16 x i8> %V, <16 x i8> %trigg ; SSE2-NEXT: movb %ch, (%rdi) ; SSE2-NEXT: incq %rdi ; SSE2-NEXT: LBB12_28: ## %else38 -; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: jne LBB12_29 ; SSE2-NEXT: ## %bb.30: ## %else41 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index 4c5b67962a58b..2b78b98934f27 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1003,17 +1003,17 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, < ; AVX512F-LABEL: expandload_v16f64_v16i32: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm3, %k1 -; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k2 -; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k2} -; AVX512F-NEXT: kmovw %k2, %eax +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k1} +; AVX512F-NEXT: vptestnmd %zmm3, %zmm3, %k2 +; AVX512F-NEXT: kmovw %k1, %eax ; AVX512F-NEXT: movzbl %al, %eax ; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201 ; AVX512F-NEXT: shrl $3, %eax ; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111 ; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111 ; AVX512F-NEXT: shrl $28, %eax -; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k2} ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: expandload_v16f64_v16i32: @@ -1034,17 +1034,17 @@ define <16 x double> 
@expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, < ; AVX512VLBW-LABEL: expandload_v16f64_v16i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm3, %k1 -; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k2 -; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k2} -; AVX512VLBW-NEXT: kmovd %k2, %eax +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 +; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k1} +; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm3, %k2 +; AVX512VLBW-NEXT: kmovd %k1, %eax ; AVX512VLBW-NEXT: movzbl %al, %eax ; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201 ; AVX512VLBW-NEXT: shrl $3, %eax ; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111 ; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111 ; AVX512VLBW-NEXT: shrl $28, %eax -; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k2} ; AVX512VLBW-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x double> @llvm.masked.expandload.v16f64(ptr %base, <16 x i1> %mask, <16 x double> %src0) diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll index 559a7ec0930b9..1acde45a4fdb2 100644 --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -73,8 +73,8 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, < ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB0_4: # %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne .LBB0_5 ; AVX1-NEXT: # %bb.6: # %else5 ; AVX1-NEXT: testb $8, %al @@ -113,8 +113,8 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, < ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB0_4: # %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne .LBB0_5 ; AVX2-NEXT: # %bb.6: # %else5 ; AVX2-NEXT: testb $8, %al @@ -153,8 +153,8 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, < ; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX512F-NEXT: .LBB0_4: # %else2 -; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: jne .LBB0_5 ; AVX512F-NEXT: # %bb.6: # %else5 ; AVX512F-NEXT: testb $8, %al @@ -262,8 +262,8 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB1_4: # %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne .LBB1_5 ; AVX1-NEXT: # %bb.6: # %else5 ; AVX1-NEXT: testb $8, %al @@ -307,8 +307,8 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB1_4: # %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne .LBB1_5 ; AVX2-NEXT: # %bb.6: # %else5 ; AVX2-NEXT: testb $8, %al @@ -352,8 +352,8 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> 
%idx, <4 x i32 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX512F-NEXT: .LBB1_4: # %else2 -; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: jne .LBB1_5 ; AVX512F-NEXT: # %bb.6: # %else5 ; AVX512F-NEXT: testb $8, %al @@ -459,8 +459,8 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX1-NEXT: .LBB2_4: # %else2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: jne .LBB2_5 ; AVX1-NEXT: # %bb.6: # %else5 ; AVX1-NEXT: testb $8, %al @@ -503,8 +503,8 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX2-NEXT: .LBB2_4: # %else2 -; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: jne .LBB2_5 ; AVX2-NEXT: # %bb.6: # %else5 ; AVX2-NEXT: testb $8, %al @@ -547,8 +547,8 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; AVX512F-NEXT: .LBB2_4: # %else2 -; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testb $4, %al ; AVX512F-NEXT: jne .LBB2_5 ; AVX512F-NEXT: # %bb.6: # %else5 ; AVX512F-NEXT: testb $8, %al @@ -750,8 +750,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: .LBB3_4: # %else2 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm5 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm2 -; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je .LBB3_6 ; AVX1-NEXT: # %bb.5: # %cond.load4 ; AVX1-NEXT: vmovq %xmm0, %rcx @@ -781,8 +781,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: .LBB3_12: # %else14 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm5 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm2 -; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB3_14 ; AVX1-NEXT: # %bb.13: # %cond.load16 ; AVX1-NEXT: vmovq %xmm0, %rcx @@ -813,8 +813,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: .LBB3_20: # %else26 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $1024, %eax # imm = 0x400 ; AVX1-NEXT: je .LBB3_22 ; AVX1-NEXT: # %bb.21: # %cond.load28 ; AVX1-NEXT: vmovq %xmm0, %rcx @@ -840,8 +840,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: vpinsrb $13, (%rcx), %xmm3, %xmm3 ; AVX1-NEXT: .LBB3_28: # %else38 -; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX1-NEXT: jne .LBB3_29 ; AVX1-NEXT: # %bb.30: # %else41 ; AVX1-NEXT: testl $32768, %eax # imm = 0x8000 @@ -884,8 +884,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX2-NEXT: vpinsrb $1, (%rcx), %xmm3, %xmm3 ; AVX2-NEXT: .LBB3_4: # %else2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 
-; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je .LBB3_6 ; AVX2-NEXT: # %bb.5: # %cond.load4 ; AVX2-NEXT: vmovq %xmm0, %rcx @@ -911,8 +911,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: vpinsrb $5, (%rcx), %xmm3, %xmm3 ; AVX2-NEXT: .LBB3_12: # %else14 -; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB3_14 ; AVX2-NEXT: # %bb.13: # %cond.load16 ; AVX2-NEXT: vmovq %xmm0, %rcx @@ -939,8 +939,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX2-NEXT: vpinsrb $9, (%rcx), %xmm3, %xmm3 ; AVX2-NEXT: .LBB3_20: # %else26 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $1024, %eax # imm = 0x400 ; AVX2-NEXT: je .LBB3_22 ; AVX2-NEXT: # %bb.21: # %cond.load28 ; AVX2-NEXT: vmovq %xmm0, %rcx @@ -966,8 +966,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: vpinsrb $13, (%rcx), %xmm3, %xmm3 ; AVX2-NEXT: .LBB3_28: # %else38 -; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX2-NEXT: jne .LBB3_29 ; AVX2-NEXT: # %bb.30: # %else41 ; AVX2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -1012,8 +1012,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX512-NEXT: vpextrq $1, %xmm1, %rcx ; AVX512-NEXT: vpinsrb $3, (%rcx), %xmm2, %xmm2 ; AVX512-NEXT: .LBB3_8: # %else8 -; AVX512-NEXT: testb $16, %al ; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm1 +; AVX512-NEXT: testb $16, %al ; AVX512-NEXT: je .LBB3_10 ; AVX512-NEXT: # %bb.9: # %cond.load10 ; AVX512-NEXT: vmovq %xmm1, %rcx @@ -1026,8 +1026,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX512-NEXT: vpinsrb $5, (%rcx), %xmm2, %xmm2 ; AVX512-NEXT: .LBB3_12: # %else14 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: testb $64, %al ; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm0 +; AVX512-NEXT: testb $64, %al ; AVX512-NEXT: je .LBB3_14 ; AVX512-NEXT: # %bb.13: # %cond.load16 ; AVX512-NEXT: vmovq %xmm0, %rcx @@ -1057,8 +1057,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX512-NEXT: vpextrq $1, %xmm1, %rcx ; AVX512-NEXT: vpinsrb $11, (%rcx), %xmm2, %xmm2 ; AVX512-NEXT: .LBB3_24: # %else32 -; AVX512-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512-NEXT: testl $4096, %eax # imm = 0x1000 ; AVX512-NEXT: je .LBB3_26 ; AVX512-NEXT: # %bb.25: # %cond.load34 ; AVX512-NEXT: vmovq %xmm1, %rcx @@ -1070,8 +1070,8 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(ptr %base, <16 x i32> %idx, <16 x i8 ; AVX512-NEXT: vpextrq $1, %xmm1, %rcx ; AVX512-NEXT: vpinsrb $13, (%rcx), %xmm2, %xmm2 ; AVX512-NEXT: .LBB3_28: # %else38 -; AVX512-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: testl $16384, %eax # imm = 0x4000 ; AVX512-NEXT: jne .LBB3_29 ; AVX512-NEXT: # %bb.30: # %else41 ; AVX512-NEXT: testl $32768, %eax # imm = 0x8000 @@ -1334,8 +1334,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; 
AVX1-NEXT: testb $1, %al ; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -1367,8 +1367,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vmovmskps %ymm3, %eax -; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: # implicit-def: $ymm3 +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_17 ; AVX1-NEXT: # %bb.18: # %else26 ; AVX1-NEXT: testb $2, %al @@ -1400,8 +1400,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax -; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: # implicit-def: $ymm0 +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_33 ; AVX1-NEXT: # %bb.34: # %else67 ; AVX1-NEXT: testb $2, %al @@ -1544,8 +1544,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: # implicit-def: $ymm1 +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else ; AVX2-NEXT: testb $2, %al @@ -1575,8 +1575,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vmovmskps %ymm2, %eax -; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: # implicit-def: $ymm2 +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_17 ; AVX2-NEXT: # %bb.18: # %else26 ; AVX2-NEXT: testb $2, %al @@ -1606,8 +1606,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: # implicit-def: $ymm0 +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_33 ; AVX2-NEXT: # %bb.34: # %else67 ; AVX2-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index af018d83d520e..f641975337d66 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -646,8 +646,8 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) { ; X64-KNL-NEXT: vpbroadcastq %xmm1, %ymm1 ; X64-KNL-NEXT: vpaddq %ymm0, %ymm1, %ymm1 ; X64-KNL-NEXT: kmovw %k0, %eax -; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: # implicit-def: $xmm0 +; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: je .LBB14_2 ; X64-KNL-NEXT: # %bb.1: # %cond.load ; X64-KNL-NEXT: vmovq %xmm1, %rcx @@ -659,8 +659,8 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) { ; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx ; X64-KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; X64-KNL-NEXT: .LBB14_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB14_5 ; X64-KNL-NEXT: # %bb.6: # %else5 ; X64-KNL-NEXT: testb $8, %al @@ -683,12 +683,12 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) { ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw 
%k0, %eax -; X86-KNL-NEXT: testb $1, %al ; X86-KNL-NEXT: # implicit-def: $xmm0 +; X86-KNL-NEXT: testb $1, %al ; X86-KNL-NEXT: jne .LBB14_1 ; X86-KNL-NEXT: # %bb.2: # %else ; X86-KNL-NEXT: testb $2, %al @@ -773,8 +773,8 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub ; X64-KNL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] ; X64-KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; X64-KNL-NEXT: .LBB15_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB15_5 ; X64-KNL-NEXT: # %bb.6: # %else5 ; X64-KNL-NEXT: testb $8, %al @@ -799,8 +799,8 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -900,8 +900,8 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1008,8 +1008,8 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { ; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx ; X64-KNL-NEXT: vextractps $1, %xmm0, (%rcx) ; X64-KNL-NEXT: .LBB17_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB17_5 ; X64-KNL-NEXT: # %bb.6: # %else4 ; X64-KNL-NEXT: testb $8, %al @@ -1108,9 +1108,9 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) { ; X64-KNL-NEXT: vpextrq $1, %xmm1, %rcx ; X64-KNL-NEXT: vmovhps %xmm0, (%rcx) ; X64-KNL-NEXT: .LBB18_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB18_5 ; X64-KNL-NEXT: # %bb.6: # %else4 ; X64-KNL-NEXT: testb $8, %al @@ -1135,8 +1135,8 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) { ; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; X86-KNL-NEXT: vpmovqd %zmm2, %ymm1 -; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1 ; X86-KNL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1151,8 +1151,8 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) { ; X86-KNL-NEXT: vpextrd $1, %xmm1, %ecx ; X86-KNL-NEXT: vmovhps %xmm0, (%ecx) ; X86-KNL-NEXT: .LBB18_4: # %else2 -; X86-KNL-NEXT: testb $4, %al ; X86-KNL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-KNL-NEXT: testb $4, %al ; X86-KNL-NEXT: jne .LBB18_5 ; X86-KNL-NEXT: # %bb.6: # %else4 ; X86-KNL-NEXT: testb $8, %al @@ -1426,8 +1426,8 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $2, %xmm0, 
%xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1550,8 +1550,8 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 ; X86-KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1675,8 +1675,8 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1795,8 +1795,8 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> % ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 ; X86-KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -1966,8 +1966,8 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpslld $3, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -2167,21 +2167,37 @@ define <2 x float> @test27(ptr %base, <2 x i32> %ind) { ; Data type requires promotion, mask is all-ones define void @test28(<2 x i32>%a1, <2 x ptr> %ptr) { -; X64-LABEL: test28: -; X64: # %bb.0: -; X64-NEXT: vmovq %xmm1, %rax -; X64-NEXT: vmovss %xmm0, (%rax) -; X64-NEXT: vpextrq $1, %xmm1, %rax -; X64-NEXT: vextractps $1, %xmm0, (%rax) -; X64-NEXT: retq +; X64-KNL-LABEL: test28: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vmovq %xmm1, %rax +; X64-KNL-NEXT: vmovss %xmm0, (%rax) +; X64-KNL-NEXT: vpextrq $1, %xmm1, %rax +; X64-KNL-NEXT: vextractps $1, %xmm0, (%rax) +; X64-KNL-NEXT: retq ; -; X86-LABEL: test28: -; X86: # %bb.0: -; X86-NEXT: vmovd %xmm1, %eax -; X86-NEXT: vmovss %xmm0, (%eax) -; X86-NEXT: vpextrd $1, %xmm1, %eax -; X86-NEXT: vextractps $1, %xmm0, (%eax) -; X86-NEXT: retl +; X86-KNL-LABEL: test28: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: vmovd %xmm1, %eax +; X86-KNL-NEXT: vmovss %xmm0, (%eax) +; X86-KNL-NEXT: vpextrd $1, %xmm1, %eax +; X86-KNL-NEXT: vextractps $1, %xmm0, (%eax) +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: test28: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vmovq %xmm1, %rax +; X64-SKX-NEXT: vpextrq $1, %xmm1, %rcx +; X64-SKX-NEXT: vmovss %xmm0, (%rax) +; X64-SKX-NEXT: vextractps $1, %xmm0, (%rcx) +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: test28: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: vmovd %xmm1, %eax +; X86-SKX-NEXT: vpextrd $1, %xmm1, %ecx +; X86-SKX-NEXT: vmovss %xmm0, (%eax) +; X86-SKX-NEXT: vextractps $1, %xmm0, (%ecx) +; 
X86-SKX-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> ) ret void } @@ -2968,8 +2984,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X64-KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; X64-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; X64-KNL-NEXT: kmovw %k0, %eax -; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: # implicit-def: $ymm1 +; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: je .LBB42_2 ; X64-KNL-NEXT: # %bb.1: # %cond.load ; X64-KNL-NEXT: vmovq %xmm0, %rcx @@ -2982,8 +2998,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X64-KNL-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm2 ; X64-KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X64-KNL-NEXT: .LBB42_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: je .LBB42_6 ; X64-KNL-NEXT: # %bb.5: # %cond.load4 ; X64-KNL-NEXT: vmovq %xmm2, %rcx @@ -2998,8 +3014,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X64-KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; X64-KNL-NEXT: .LBB42_8: # %else8 ; X64-KNL-NEXT: kmovw %k0, %eax -; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: # implicit-def: $ymm3 +; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: jne .LBB42_9 ; X64-KNL-NEXT: # %bb.10: # %else15 ; X64-KNL-NEXT: testb $2, %al @@ -3016,8 +3032,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X64-KNL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; X64-KNL-NEXT: .LBB42_16: # %else33 ; X64-KNL-NEXT: kmovw %k0, %eax -; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: # implicit-def: $ymm4 +; X64-KNL-NEXT: testb $1, %al ; X64-KNL-NEXT: jne .LBB42_17 ; X64-KNL-NEXT: # %bb.18: # %else40 ; X64-KNL-NEXT: testb $2, %al @@ -3089,23 +3105,23 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X86-KNL-NEXT: vpslld $31, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; X86-KNL-NEXT: kmovw %k0, %ebx -; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: vmovd %xmm0, %eax ; X86-KNL-NEXT: # implicit-def: $ymm1 +; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: je .LBB42_2 ; X86-KNL-NEXT: # %bb.1: # %cond.load ; X86-KNL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X86-KNL-NEXT: .LBB42_2: # %else -; X86-KNL-NEXT: testb $2, %bl ; X86-KNL-NEXT: vpextrd $1, %xmm0, %ecx +; X86-KNL-NEXT: testb $2, %bl ; X86-KNL-NEXT: je .LBB42_4 ; X86-KNL-NEXT: # %bb.3: # %cond.load1 ; X86-KNL-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm2 ; X86-KNL-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 ; X86-KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X86-KNL-NEXT: .LBB42_4: # %else2 -; X86-KNL-NEXT: testb $4, %bl ; X86-KNL-NEXT: vpextrd $2, %xmm0, %edx +; X86-KNL-NEXT: testb $4, %bl ; X86-KNL-NEXT: je .LBB42_6 ; X86-KNL-NEXT: # %bb.5: # %cond.load4 ; X86-KNL-NEXT: vpbroadcastd (%edx), %ymm2 @@ -3113,8 +3129,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X86-KNL-NEXT: vpbroadcastd 4(%edx), %ymm2 ; X86-KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; X86-KNL-NEXT: .LBB42_6: # %else5 -; X86-KNL-NEXT: testb $8, %bl ; X86-KNL-NEXT: vpextrd $3, %xmm0, %esi +; X86-KNL-NEXT: testb $8, %bl ; X86-KNL-NEXT: je .LBB42_8 ; X86-KNL-NEXT: # %bb.7: # %cond.load7 ; X86-KNL-NEXT: vpbroadcastd (%esi), %ymm0 @@ -3123,8 +3139,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X86-KNL-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; X86-KNL-NEXT: .LBB42_8: # %else8 ; X86-KNL-NEXT: kmovw %k0, %ebx -; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: # implicit-def: $ymm0 +; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: jne .LBB42_9 ; X86-KNL-NEXT: # %bb.10: # %else15 ; X86-KNL-NEXT: testb $2, %bl @@ -3142,8 +3158,8 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64 ; X86-KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; X86-KNL-NEXT: .LBB42_16: # %else33 ; X86-KNL-NEXT: kmovw %k0, %ebx -; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: # implicit-def: $ymm2 +; X86-KNL-NEXT: testb $1, %bl ; X86-KNL-NEXT: jne .LBB42_17 ; X86-KNL-NEXT: # %bb.18: # %else40 ; X86-KNL-NEXT: testb $2, %bl @@ -3404,8 +3420,8 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x ; X86-KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; X86-KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; X86-KNL-NEXT: vpslld $2, %xmm0, %xmm0 ; X86-KNL-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -3672,8 +3688,8 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; X86-KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1 ; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; X86-KNL-NEXT: vpslld $3, %xmm1, %xmm1 ; X86-KNL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: testb $1, %al @@ -4004,10 +4020,10 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) { ; X64-KNL-LABEL: gather_2i64_constant_indices: ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-KNL-NEXT: vmovq %rdi, %xmm1 +; X64-KNL-NEXT: vpbroadcastq %xmm1, %xmm1 +; X64-KNL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; X64-KNL-NEXT: vmovq %rdi, %xmm0 -; X64-KNL-NEXT: vpbroadcastq %xmm0, %xmm0 -; X64-KNL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; X64-KNL-NEXT: kmovw %k0, %eax ; X64-KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-KNL-NEXT: testb $1, %al @@ -4032,9 +4048,9 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) { ; X86-KNL-LABEL: gather_2i64_constant_indices: ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-KNL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ; X86-KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; X86-KNL-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 -; X86-KNL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 ; X86-KNL-NEXT: kmovw %k0, %eax ; X86-KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-KNL-NEXT: testb $1, %al @@ -4060,9 +4076,9 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) { ; X64-SKX-SMALL-LABEL: gather_2i64_constant_indices: ; X64-SKX-SMALL: # %bb.0: ; X64-SKX-SMALL-NEXT: vpsllq $63, %xmm0, %xmm0 +; X64-SKX-SMALL-NEXT: vpbroadcastq %rdi, %xmm1 +; X64-SKX-SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpmovq2m %xmm0, %k0 -; X64-SKX-SMALL-NEXT: vpbroadcastq %rdi, %xmm0 -; X64-SKX-SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; X64-SKX-SMALL-NEXT: kmovw %k0, %eax ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; 
X64-SKX-SMALL-NEXT: testb $1, %al @@ -4085,10 +4101,10 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) { ; X64-SKX-LARGE-LABEL: gather_2i64_constant_indices: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: vpsllq $63, %xmm0, %xmm0 -; X64-SKX-LARGE-NEXT: vpmovq2m %xmm0, %k0 -; X64-SKX-LARGE-NEXT: vpbroadcastq %rdi, %xmm0 +; X64-SKX-LARGE-NEXT: vpbroadcastq %rdi, %xmm1 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1 +; X64-SKX-LARGE-NEXT: vpaddq (%rax), %xmm1, %xmm1 +; X64-SKX-LARGE-NEXT: vpmovq2m %xmm0, %k0 ; X64-SKX-LARGE-NEXT: kmovw %k0, %eax ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: testb $1, %al @@ -4111,9 +4127,9 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) { ; X86-SKX-LABEL: gather_2i64_constant_indices: ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: vpsllq $63, %xmm0, %xmm0 +; X86-SKX-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; X86-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ; X86-SKX-NEXT: vpmovq2m %xmm0, %k0 -; X86-SKX-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 -; X86-SKX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 ; X86-SKX-NEXT: kmovw %k0, %eax ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: testb $1, %al @@ -4405,8 +4421,8 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru ; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx ; X64-KNL-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1 ; X64-KNL-NEXT: .LBB62_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB62_5 ; X64-KNL-NEXT: # %bb.6: # %else5 ; X64-KNL-NEXT: testb $8, %al @@ -4515,8 +4531,8 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) { ; X64-KNL-NEXT: vpextrq $1, %xmm0, %rcx ; X64-KNL-NEXT: vextractps $1, %xmm1, (%rcx) ; X64-KNL-NEXT: .LBB63_4: # %else2 -; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X64-KNL-NEXT: testb $4, %al ; X64-KNL-NEXT: jne .LBB63_5 ; X64-KNL-NEXT: # %bb.6: # %else4 ; X64-KNL-NEXT: testb $8, %al @@ -4805,10 +4821,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-KNL-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4829,10 +4845,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-SMALL: # %bb.0: ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-SKX-SMALL-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps 
%zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4841,11 +4857,11 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm2 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm2, %zmm2 +; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-LARGE-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4874,10 +4890,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-KNL-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4898,10 +4914,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-SMALL: # %bb.0: ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-SKX-SMALL-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4910,11 +4926,11 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm2 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm2, %zmm2 +; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-LARGE-NEXT: vpaddd %zmm2, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4943,10 +4959,10 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-KNL-NEXT: vpaddd %zmm2, %zmm2, %zmm2 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4971,10 +4987,10 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-SMALL: # %bb.0: ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm2 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-SMALL-NEXT: vpaddd %zmm2, %zmm2, %zmm2 ; X64-SKX-SMALL-NEXT: kmovw %k1, %k2 ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4985,11 +5001,11 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm2 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm2, %zmm2 +; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-LARGE-NEXT: vpaddd %zmm2, %zmm2, %zmm2 ; X64-SKX-LARGE-NEXT: kmovw %k1, %k2 ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll index aad1b44344850..7a5b533c7d66a 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -458,38 +458,38 @@ define void @test_mscatter_v17f32(ptr %base, <17 x i32> %index, <17 x float> %va ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; WIDEN_SKX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; WIDEN_SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; WIDEN_SKX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; WIDEN_SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vpinsrd 
$2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; WIDEN_SKX-NEXT: vmovd %esi, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $3, %r8d, %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vmovd %r9d, %xmm3 +; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; WIDEN_SKX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; WIDEN_SKX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; WIDEN_SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm4, %xmm2 +; WIDEN_SKX-NEXT: vmovd %esi, %xmm4 +; WIDEN_SKX-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vpinsrd $2, %ecx, %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; WIDEN_SKX-NEXT: vpinsrd $3, %r8d, %xmm4, %xmm1 +; WIDEN_SKX-NEXT: vmovd %r9d, %xmm4 +; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_SKX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm4, %xmm3 +; WIDEN_SKX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; WIDEN_SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: kxnorw %k0, %k0, %k1 @@ -506,38 +506,38 @@ define void @test_mscatter_v17f32(ptr %base, <17 x i32> %index, <17 x float> %va ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; WIDEN_KNL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; WIDEN_KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; WIDEN_KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; WIDEN_KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_KNL-NEXT: 
vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; WIDEN_KNL-NEXT: vmovd %esi, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $3, %r8d, %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vmovd %r9d, %xmm3 +; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; WIDEN_KNL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; WIDEN_KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm4, %xmm2 +; WIDEN_KNL-NEXT: vmovd %esi, %xmm4 +; WIDEN_KNL-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vpinsrd $2, %ecx, %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; WIDEN_KNL-NEXT: vpinsrd $3, %r8d, %xmm4, %xmm1 +; WIDEN_KNL-NEXT: vmovd %r9d, %xmm4 +; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; WIDEN_KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm4, %xmm3 +; WIDEN_KNL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; WIDEN_KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: kxnorw %k0, %k0, %k1 @@ -642,7 +642,6 @@ define void @test_mscatter_v17f32(ptr %base, <17 x i32> %index, <17 x float> %va define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; WIDEN_SKX-LABEL: test_mgather_v17f32: ; WIDEN_SKX: # %bb.0: -; WIDEN_SKX-NEXT: movq %rdi, %rax ; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 @@ -651,21 +650,22 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; WIDEN_SKX-NEXT: vmovd %edx, %xmm2 +; WIDEN_SKX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; WIDEN_SKX-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2 +; WIDEN_SKX-NEXT: movq %rdi, %rax +; WIDEN_SKX-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; WIDEN_SKX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; WIDEN_SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_SKX-NEXT: vmovd %edx, %xmm1 -; WIDEN_SKX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vpinsrd $2, %r8d, %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vmovd 
{{.*#+}} xmm2 = mem[0],zero,zero,zero -; WIDEN_SKX-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; WIDEN_SKX-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm1 +; WIDEN_SKX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; WIDEN_SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; WIDEN_SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; WIDEN_SKX-NEXT: kxnorw %k0, %k0, %k1 ; WIDEN_SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; WIDEN_SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; WIDEN_SKX-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1} ; WIDEN_SKX-NEXT: movw $1, %cx ; WIDEN_SKX-NEXT: kmovw %ecx, %k1 @@ -677,7 +677,6 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; ; WIDEN_KNL-LABEL: test_mgather_v17f32: ; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: movq %rdi, %rax ; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 @@ -686,21 +685,22 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; WIDEN_KNL-NEXT: vmovd %edx, %xmm2 +; WIDEN_KNL-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; WIDEN_KNL-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2 +; WIDEN_KNL-NEXT: movq %rdi, %rax +; WIDEN_KNL-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; WIDEN_KNL-NEXT: vmovd %edx, %xmm1 -; WIDEN_KNL-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; WIDEN_KNL-NEXT: vpinsrd $2, %r8d, %xmm1, %xmm1 -; WIDEN_KNL-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 -; WIDEN_KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; WIDEN_KNL-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; WIDEN_KNL-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm1 +; WIDEN_KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; WIDEN_KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; WIDEN_KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; WIDEN_KNL-NEXT: kxnorw %k0, %k0, %k1 ; WIDEN_KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; WIDEN_KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; WIDEN_KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; WIDEN_KNL-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1} ; WIDEN_KNL-NEXT: movw $1, %cx ; WIDEN_KNL-NEXT: kmovw %ecx, %k1 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 89459a2d10177..6b27c2ef9a61f 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -230,8 +230,8 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) { ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE-NEXT: movmskps %xmm1, %eax -; SSE-NEXT: testb $1, %al ; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: testb $1, %al ; SSE-NEXT: jne LBB3_1 ; SSE-NEXT: ## %bb.2: ## %else ; SSE-NEXT: testb $2, %al @@ -506,32 +506,32 @@ define <8 x 
double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 -; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 -; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm1 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm3 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v8f64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 -; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm1, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm1 +; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v8f64_v8i16: @@ -728,15 +728,15 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm6 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %ymm0, %ymm6, %ymm2, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 -; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX1-NEXT: retq @@ -744,10 +744,10 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double ; AVX2-LABEL: load_v8f64_v8i64: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, 
%ymm2 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq @@ -882,8 +882,8 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: ## implicit-def: $xmm0 +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB8_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -907,8 +907,8 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) { ; SSE42-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE42-NEXT: pmovsxdq %xmm1, %xmm0 ; SSE42-NEXT: movmskpd %xmm0, %eax -; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: ## implicit-def: $xmm0 +; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: jne LBB8_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -1097,8 +1097,8 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB10_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -1176,8 +1176,8 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE42-NEXT: packsswb %xmm0, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, %eax ; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: xorps %xmm1, %xmm1 +; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: jne LBB10_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -1901,32 +1901,32 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 -; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1 -; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm1 +; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm3 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v8i64_v8i16: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0 -; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vpmovsxwq %xmm4, %ymm4 +; AVX2-NEXT: vpmaskmovq (%rdi), %ymm4, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vblendvpd %ymm4, %ymm5, %ymm1, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm1 +; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq 
; ; AVX512F-LABEL: load_v8i64_v8i16: @@ -2127,15 +2127,15 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst ; AVX1: ## %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm6 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %ymm0, %ymm6, %ymm2, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4 -; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 ; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm2 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX1-NEXT: retq @@ -2143,10 +2143,10 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst ; AVX2-LABEL: load_v8i64_v8i64: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm2 ; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: retq @@ -2615,8 +2615,8 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB20_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -2694,8 +2694,8 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, ptr %addr) { ; SSE42-NEXT: packsswb %xmm0, %xmm0 ; SSE42-NEXT: pmovmskb %xmm0, %eax ; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: testb $1, %al ; SSE42-NEXT: jne LBB20_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -6429,7 +6429,7 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) { ; ; AVX2-LABEL: mload_constmask_v4i32: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: retq @@ -6835,10 +6835,10 @@ define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x doub ; SSE-NEXT: movups (%rsi), %xmm0 ; SSE-NEXT: movups 16(%rsi), %xmm1 ; SSE-NEXT: movups 32(%rsi), %xmm2 -; SSE-NEXT: movups 48(%rsi), %xmm3 ; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; SSE-NEXT: movups 48(%rsi), %xmm3 ; SSE-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm7, 112(%rdi) ; SSE-NEXT: movaps %xmm6, 96(%rdi) @@ -6855,8 +6855,8 @@ define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x doub ; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = 
[18446744073709551615,0,18446744073709551615,0] ; AVX1OR2-NEXT: ## ymm0 = mem[0,1,0,1] ; AVX1OR2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; AVX1OR2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 +; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] ; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 ; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1 @@ -6962,7 +6962,7 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) { ; ; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -7158,9 +7158,9 @@ define <8 x double> @load_one_mask_bit_set5(ptr %addr, <8 x double> %val) { define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; SSE2-LABEL: load_one_mask_bit_set6: ; SSE2: ## %bb.0: -; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] ; SSE2-NEXT: movaps %xmm7, 112(%rdi) @@ -7175,9 +7175,9 @@ define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; ; SSE42-LABEL: load_one_mask_bit_set6: ; SSE42: ## %bb.0: -; SSE42-NEXT: movq %rdi, %rax ; SSE42-NEXT: pinsrq $0, 16(%rsi), %xmm1 ; SSE42-NEXT: pinsrq $0, 80(%rsi), %xmm5 +; SSE42-NEXT: movq %rdi, %rax ; SSE42-NEXT: pinsrq $1, 104(%rsi), %xmm6 ; SSE42-NEXT: movaps %xmm7, 112(%rdi) ; SSE42-NEXT: movdqa %xmm6, 96(%rdi) @@ -7193,8 +7193,8 @@ define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; AVX1: ## %bb.0: ; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,18446744073709551615,0] ; AVX1-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm5 -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] ; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm4, %ymm4 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] ; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] ; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [0,18446744073709551615,0,0] ; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm4, %ymm4 @@ -7203,12 +7203,12 @@ define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { ; ; AVX2-LABEL: load_one_mask_bit_set6: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,18446744073709551615,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq (%rdi), %ymm4, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,18446744073709551615,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,18446744073709551615,0,0] ; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: retq @@ -7260,8 +7260,8 @@ define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) { define i32 @pr38986(i1 %c, ptr %p) { ; SSE-LABEL: pr38986: ; SSE: ## %bb.0: -; SSE-NEXT: testb $1, %dil ; SSE-NEXT: ## implicit-def: $eax +; SSE-NEXT: testb $1, %dil ; 
SSE-NEXT: je LBB45_2 ; SSE-NEXT: ## %bb.1: ## %cond.load ; SSE-NEXT: movl (%rsi), %eax @@ -7270,8 +7270,8 @@ define i32 @pr38986(i1 %c, ptr %p) { ; ; AVX-LABEL: pr38986: ; AVX: ## %bb.0: -; AVX-NEXT: testb $1, %dil ; AVX-NEXT: ## implicit-def: $eax +; AVX-NEXT: testb $1, %dil ; AVX-NEXT: je LBB45_2 ; AVX-NEXT: ## %bb.1: ## %cond.load ; AVX-NEXT: movl (%rsi), %eax diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index c7320275091c6..0db61976f5666 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -2198,8 +2198,8 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val ; AVX1-NEXT: LBB14_15: ## %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi) ; AVX1-NEXT: LBB14_16: ## %else14 -; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: jne LBB14_17 ; AVX1-NEXT: ## %bb.18: ## %else16 ; AVX1-NEXT: testl $512, %eax ## imm = 0x200 @@ -2320,8 +2320,8 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val ; AVX2-NEXT: LBB14_15: ## %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi) ; AVX2-NEXT: LBB14_16: ## %else14 -; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: jne LBB14_17 ; AVX2-NEXT: ## %bb.18: ## %else16 ; AVX2-NEXT: testl $512, %eax ## imm = 0x200 @@ -2442,8 +2442,8 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val ; AVX512F-NEXT: LBB14_15: ## %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi) ; AVX512F-NEXT: LBB14_16: ## %else14 -; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512F-NEXT: jne LBB14_17 ; AVX512F-NEXT: ## %bb.18: ## %else16 ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 @@ -2564,8 +2564,8 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val ; AVX512VLDQ-NEXT: LBB14_15: ## %cond.store13 ; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) ; AVX512VLDQ-NEXT: LBB14_16: ## %else14 -; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512VLDQ-NEXT: jne LBB14_17 ; AVX512VLDQ-NEXT: ## %bb.18: ## %else16 ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 @@ -2682,8 +2682,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB15_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -2698,8 +2698,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: LBB15_8: ## %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB15_10 ; SSE2-NEXT: ## %bb.9: ## %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -2709,8 +2709,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: ## %bb.11: ## %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: LBB15_12: ## %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: testb 
$64, %al ; SSE2-NEXT: je LBB15_14 ; SSE2-NEXT: ## %bb.13: ## %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -2720,8 +2720,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: ## %bb.15: ## %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: LBB15_16: ## %else14 -; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: je LBB15_18 ; SSE2-NEXT: ## %bb.17: ## %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -2731,8 +2731,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: ## %bb.19: ## %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: LBB15_20: ## %else18 -; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: je LBB15_22 ; SSE2-NEXT: ## %bb.21: ## %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -2742,8 +2742,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: ## %bb.23: ## %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: LBB15_24: ## %else22 -; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: je LBB15_26 ; SSE2-NEXT: ## %bb.25: ## %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -2753,8 +2753,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no ; SSE2-NEXT: ## %bb.27: ## %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: LBB15_28: ## %else26 -; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: jne LBB15_29 ; SSE2-NEXT: ## %bb.30: ## %else28 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000 @@ -3283,8 +3283,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne LBB16_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -3299,8 +3299,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: LBB16_8: ## %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je LBB16_10 ; SSE2-NEXT: ## %bb.9: ## %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -3310,8 +3310,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.11: ## %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: LBB16_12: ## %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je LBB16_14 ; SSE2-NEXT: ## %bb.13: ## %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -3321,8 +3321,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.15: ## %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: LBB16_16: ## %else14 -; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: testl $256, %eax ## imm = 0x100 ; SSE2-NEXT: je LBB16_18 ; SSE2-NEXT: ## %bb.17: ## %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -3332,8 +3332,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; 
SSE2-NEXT: ## %bb.19: ## %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: LBB16_20: ## %else18 -; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testl $1024, %eax ## imm = 0x400 ; SSE2-NEXT: je LBB16_22 ; SSE2-NEXT: ## %bb.21: ## %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -3343,8 +3343,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.23: ## %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: LBB16_24: ## %else22 -; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testl $4096, %eax ## imm = 0x1000 ; SSE2-NEXT: je LBB16_26 ; SSE2-NEXT: ## %bb.25: ## %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -3354,8 +3354,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.27: ## %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: LBB16_28: ## %else26 -; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000 ; SSE2-NEXT: je LBB16_30 ; SSE2-NEXT: ## %bb.29: ## %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) @@ -3365,8 +3365,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.31: ## %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: LBB16_32: ## %else30 -; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 ; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: testl $65536, %eax ## imm = 0x10000 ; SSE2-NEXT: jne LBB16_33 ; SSE2-NEXT: ## %bb.34: ## %else32 ; SSE2-NEXT: testl $131072, %eax ## imm = 0x20000 @@ -3381,8 +3381,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: LBB16_40: ## %else38 -; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm3, %ecx +; SSE2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; SSE2-NEXT: je LBB16_42 ; SSE2-NEXT: ## %bb.41: ## %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) @@ -3392,8 +3392,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.43: ## %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: LBB16_44: ## %else42 -; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm3, %ecx +; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; SSE2-NEXT: je LBB16_46 ; SSE2-NEXT: ## %bb.45: ## %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) @@ -3403,8 +3403,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.47: ## %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: LBB16_48: ## %else46 -; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm3, %ecx +; SSE2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; SSE2-NEXT: je LBB16_50 ; SSE2-NEXT: ## %bb.49: ## %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) @@ -3414,8 +3414,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.51: ## %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: LBB16_52: ## %else50 -; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm3, %ecx +; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; SSE2-NEXT: je LBB16_54 ; SSE2-NEXT: ## %bb.53: ## %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) @@ -3425,8 +3425,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> 
%val) no ; SSE2-NEXT: ## %bb.55: ## %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: LBB16_56: ## %else54 -; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm3, %ecx +; SSE2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; SSE2-NEXT: je LBB16_58 ; SSE2-NEXT: ## %bb.57: ## %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) @@ -3436,8 +3436,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; SSE2-NEXT: ## %bb.59: ## %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: LBB16_60: ## %else58 -; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm3, %ecx +; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; SSE2-NEXT: jne LBB16_61 ; SSE2-NEXT: ## %bb.62: ## %else60 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 @@ -3776,8 +3776,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; AVX1-NEXT: LBB16_31: ## %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi) ; AVX1-NEXT: LBB16_32: ## %else30 -; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX1-NEXT: jne LBB16_33 ; AVX1-NEXT: ## %bb.34: ## %else32 ; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000 @@ -4008,8 +4008,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; AVX2-NEXT: LBB16_31: ## %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi) ; AVX2-NEXT: LBB16_32: ## %else30 -; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: jne LBB16_33 ; AVX2-NEXT: ## %bb.34: ## %else32 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 @@ -4240,8 +4240,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; AVX512F-NEXT: LBB16_31: ## %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi) ; AVX512F-NEXT: LBB16_32: ## %else30 -; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX512F-NEXT: jne LBB16_33 ; AVX512F-NEXT: ## %bb.34: ## %else32 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000 @@ -4472,8 +4472,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no ; AVX512VLDQ-NEXT: LBB16_31: ## %cond.store29 ; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi) ; AVX512VLDQ-NEXT: LBB16_32: ## %else30 -; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX512VLDQ-NEXT: jne LBB16_33 ; AVX512VLDQ-NEXT: ## %bb.34: ## %else32 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000 @@ -4696,34 +4696,21 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i3 define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 x i64> %val) nounwind { ; SSE2-LABEL: mstore_constmask_allones_split: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: movq %xmm5, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm5, 8(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = mem[2,3,2,3] -; SSE2-NEXT: movq %xmm5, 
24(%rdi) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, 32(%rdi) -; SSE2-NEXT: movq %xmm4, 48(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm4, 56(%rdi) -; SSE2-NEXT: movq %xmm3, 64(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm3, 72(%rdi) -; SSE2-NEXT: movq %xmm2, 80(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm2, 88(%rdi) -; SSE2-NEXT: movq %xmm1, 96(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, 104(%rdi) -; SSE2-NEXT: movq %xmm0, 112(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, 120(%rdi) +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: movups %xmm6, (%rdi) +; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],mem[0] +; SSE2-NEXT: movupd %xmm5, 24(%rdi) +; SSE2-NEXT: movups %xmm4, 48(%rdi) +; SSE2-NEXT: movups %xmm3, 64(%rdi) +; SSE2-NEXT: movups %xmm2, 80(%rdi) +; SSE2-NEXT: movups %xmm1, 96(%rdi) +; SSE2-NEXT: movups %xmm0, 112(%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: mstore_constmask_allones_split: @@ -4758,9 +4745,9 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 ; ; AVX2-LABEL: mstore_constmask_allones_split: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm5, %ymm0, 32(%rdi) -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm4, %ymm0, (%rdi) ; AVX2-NEXT: vmovups %ymm7, 96(%rdi) ; AVX2-NEXT: vmovups %ymm6, 64(%rdi) @@ -4969,9 +4956,9 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind { ; ; AVX2-LABEL: one_mask_bit_set6: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 64(%rdi) -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,18446744073709551615,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,18446744073709551615,0] ; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, 32(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5669,10 +5656,10 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE-NEXT: movl 56(%rsi), %eax ; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE-NEXT: movl 52(%rsi), %eax -; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE-NEXT: packssdw 48(%rdi), %xmm2 ; SSE-NEXT: packssdw 16(%rdi), %xmm1 +; SSE-NEXT: movl 52(%rsi), %eax +; SSE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE-NEXT: packsswb %xmm2, %xmm1 ; SSE-NEXT: packssdw 80(%rdi), %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 @@ -5683,7 +5670,6 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; SSE-NEXT: shll $16, %edi ; SSE-NEXT: orl %eax, %edi ; SSE-NEXT: movl 48(%rsi), %r13d -; SSE-NEXT: testb $1, %dil ; 
SSE-NEXT: movl 44(%rsi), %eax ; SSE-NEXT: movl 40(%rsi), %ecx ; SSE-NEXT: movl 36(%rsi), %r8d @@ -5695,6 +5681,7 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; SSE-NEXT: movl 12(%rsi), %r14d ; SSE-NEXT: movl 8(%rsi), %r15d ; SSE-NEXT: movl 4(%rsi), %r12d +; SSE-NEXT: testb $1, %dil ; SSE-NEXT: jne LBB31_1 ; SSE-NEXT: ## %bb.2: ## %else ; SSE-NEXT: testb $2, %dil diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 2f0d419132492..a1e13d1c6fdac 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -363,7 +363,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -612,8 +612,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -628,8 +628,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -639,8 +639,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -673,7 +673,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm6, %xmm6 -; SSE4-NEXT: pmovzxbq {{.*#+}} xmm7 = [255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE4-NEXT: pand %xmm7, %xmm3 ; SSE4-NEXT: pand %xmm7, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 @@ -747,7 +747,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -824,16 +824,16 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpand %ymm3, %ymm1, 
%ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax ; AVX2-NEXT: testb $1, %al @@ -1358,8 +1358,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -1476,7 +1476,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -1813,8 +1813,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -2195,7 +2195,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -2241,8 +2241,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: .LBB9_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: jne .LBB9_17 ; AVX1-NEXT: # %bb.18: # %else16 ; AVX1-NEXT: testl $512, %eax # imm = 0x200 @@ -2372,8 +2372,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX2-NEXT: .LBB9_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: jne .LBB9_17 ; AVX2-NEXT: # %bb.18: # %else16 ; AVX2-NEXT: testl $512, %eax # imm = 0x200 @@ -2492,8 +2492,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: .LBB9_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: jne .LBB9_17 ; AVX512F-NEXT: # %bb.18: # %else16 ; AVX512F-NEXT: testl $512, %eax # imm = 0x200 @@ -2621,8 
+2621,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -2637,8 +2637,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -2648,8 +2648,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -2659,8 +2659,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -2670,8 +2670,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -2681,8 +2681,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -2692,8 +2692,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB10_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -2726,7 +2726,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm9 = [255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE4-NEXT: pand %xmm9, %xmm3 ; SSE4-NEXT: pand %xmm9, %xmm2 ; SSE4-NEXT: packusdw %xmm3, %xmm2 @@ -2858,7 +2858,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; 
AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -3658,8 +3658,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB12_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -3674,8 +3674,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -3685,8 +3685,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -3719,7 +3719,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE4-NEXT: pand %xmm5, %xmm1 ; SSE4-NEXT: pand %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -4197,15 +4197,15 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i32_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne .LBB14_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -4384,8 +4384,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -4400,8 +4400,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -4411,8 +4411,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -4422,8 
+4422,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -4433,8 +4433,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -4444,8 +4444,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -4457,8 +4457,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: .LBB15_28: # %else26 ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) @@ -4469,8 +4469,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: jne .LBB15_33 ; SSE2-NEXT: # %bb.34: # %else32 ; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -4485,8 +4485,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) @@ -4496,8 +4496,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) @@ -4507,8 +4507,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) @@ -4518,8 +4518,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, 
<32 x i8> %mask) { ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) @@ -4529,8 +4529,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) @@ -4540,8 +4540,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: jne .LBB15_61 ; SSE2-NEXT: # %bb.62: # %else60 ; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 @@ -4589,7 +4589,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE4-LABEL: truncstore_v32i16_v32i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pmovzxbw {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE4-NEXT: pand %xmm6, %xmm1 ; SSE4-NEXT: pand %xmm6, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 @@ -4828,7 +4828,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 @@ -4896,8 +4896,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX1-NEXT: .LBB15_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX1-NEXT: .LBB15_32: # %else30 -; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX1-NEXT: jne .LBB15_33 ; AVX1-NEXT: # %bb.34: # %else32 ; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5134,8 +5134,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX2-NEXT: .LBB15_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX2-NEXT: .LBB15_32: # %else30 -; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: jne .LBB15_33 ; AVX2-NEXT: # %bb.34: # %else32 ; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5373,8 +5373,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: .LBB15_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX512F-NEXT: .LBB15_32: # %else30 -; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; 
AVX512F-NEXT: jne .LBB15_33 ; AVX512F-NEXT: # %bb.34: # %else32 ; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5581,8 +5581,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: pmovmskb %xmm3, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB16_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -5597,8 +5597,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -5608,8 +5608,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -5619,8 +5619,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -5630,8 +5630,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -5641,8 +5641,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -5652,8 +5652,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB16_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -5686,7 +5686,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE4-NEXT: pand %xmm4, %xmm1 ; SSE4-NEXT: pand %xmm4, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 @@ -6203,15 +6203,15 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) 
{ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-LABEL: truncstore_v8i16_v8i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB17_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -6226,8 +6226,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -6237,8 +6237,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index c950ce64e8883..e1a461d8d3ef8 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -182,42 +182,42 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm10 = [2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm7 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm9 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movdqa %xmm10, %xmm3 +; SSE4-NEXT: movdqa %xmm9, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE4-NEXT: movdqa %xmm10, %xmm0 +; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6 ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] -; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: xorpd %xmm9, %xmm9 +; SSE4-NEXT: movapd %xmm8, 
%xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE4-NEXT: movapd %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] +; SSE4-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax @@ -288,30 +288,30 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2147483647,2147483647] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm5[0,2],xmm0[0,2] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,2],xmm0[0,2] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm5, %xmm4, %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm1[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) @@ -563,44 +563,44 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm9 = [32767,32767] -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; 
SSE4-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] +; SSE4-NEXT: movdqa %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm8, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm7 +; SSE4-NEXT: movdqa %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm2 +; SSE4-NEXT: movdqa %xmm8, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm3 +; SSE4-NEXT: movdqa %xmm8, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] -; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movdqa %xmm6, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE4-NEXT: movdqa %xmm6, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE4-NEXT: xorpd %xmm8, %xmm8 ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE4-NEXT: packssdw %xmm10, %xmm1 +; SSE4-NEXT: packssdw %xmm9, %xmm1 ; SSE4-NEXT: movapd %xmm2, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: movapd %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE4-NEXT: packssdw %xmm3, %xmm6 ; SSE4-NEXT: packssdw %xmm6, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax @@ -664,7 +664,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -675,7 +676,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 @@ -755,22 +757,22 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v8i64_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [32767,32767,32767,32767] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; 
AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [32767,32767,32767,32767] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax ; AVX2-NEXT: testb $1, %al @@ -1043,8 +1045,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -1059,8 +1061,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -1070,8 +1072,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -1104,16 +1106,15 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm9 = [127,127] +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [127,127] ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: movdqa %xmm9, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm7 ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm2 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm8 ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm9, %xmm3 @@ -1121,7 +1122,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm10 @@ -1130,19 +1132,19 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE4-NEXT: packssdw %xmm10, %xmm1 -; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movdqa %xmm6, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE4-NEXT: movapd %xmm8, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm3 +; SSE4-NEXT: packssdw %xmm10, %xmm1 +; SSE4-NEXT: movapd %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE4-NEXT: packssdw %xmm3, %xmm6 ; SSE4-NEXT: packssdw %xmm6, %xmm1 ; SSE4-NEXT: packsswb %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE4-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax @@ -1206,7 +1208,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -1217,16 +1220,17 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 ; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 @@ -1298,23 +1302,23 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: 
vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax ; AVX2-NEXT: testb $1, %al @@ -1567,7 +1571,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 ; SSE4-NEXT: movdqa %xmm5, %xmm6 @@ -1575,7 +1579,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 @@ -1644,11 +1648,11 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 @@ -1799,23 +1803,23 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm5 = [32767,32767] -; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm5, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: movdqa %xmm4, %xmm5 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm1 = 
[18446744073709518848,18446744073709518848] -; SSE4-NEXT: movapd %xmm5, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE4-NEXT: movapd %xmm6, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE4-NEXT: xorpd %xmm4, %xmm4 +; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE4-NEXT: packssdw %xmm3, %xmm1 ; SSE4-NEXT: packssdw %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 @@ -1853,13 +1857,15 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-LABEL: truncstore_v4i64_v4i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 @@ -1902,13 +1908,13 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v4i64_v4i16: ; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [32767,32767,32767,32767] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 @@ -2078,8 +2084,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -2114,28 +2120,28 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm5 = [127,127] -; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [127,127] +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm5, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, 
%xmm3, %xmm6 -; SSE4-NEXT: movdqa %xmm5, %xmm0 +; SSE4-NEXT: movdqa %xmm4, %xmm5 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE4-NEXT: movapd %xmm5, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE4-NEXT: movapd %xmm6, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE4-NEXT: movapd %xmm5, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: packssdw %xmm3, %xmm1 ; SSE4-NEXT: packssdw %xmm1, %xmm1 ; SSE4-NEXT: packsswb %xmm1, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE4-NEXT: movmskps %xmm4, %eax +; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE4-NEXT: movmskps %xmm0, %eax ; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 @@ -2168,22 +2174,24 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax ; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al @@ -2219,18 +2227,18 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v4i64_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: 
vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax ; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al @@ -2353,8 +2361,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pxor %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -2387,18 +2395,18 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-LABEL: truncstore_v2i64_v2i32: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm4, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: pmovsxdq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 -; SSE4-NEXT: movmskpd %xmm3, %eax +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] +; SSE4-NEXT: movapd %xmm3, %xmm0 +; SSE4-NEXT: pcmpgtq %xmm4, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm4 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 +; SSE4-NEXT: movmskpd %xmm2, %eax ; SSE4-NEXT: xorl $3, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB6_1 @@ -2441,10 +2449,10 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2550,11 +2558,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm4 = [32767,32767] 
+; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [32767,32767] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: pmovsxwq {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848] ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -2578,34 +2586,65 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # 
%else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2681,8 +2720,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pxor %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -2698,8 +2737,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: xorl $3, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm3, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -2718,11 +2757,11 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm4 = [127,127] +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [127,127] ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] ; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -2745,33 +2784,63 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, 
%xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -3140,8 +3209,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: .LBB9_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: jne .LBB9_17 ; AVX1-NEXT: # %bb.18: # %else16 ; AVX1-NEXT: testl $512, %eax # imm = 0x200 @@ -3269,8 +3338,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX2-NEXT: .LBB9_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: jne .LBB9_17 ; AVX2-NEXT: # %bb.18: # %else16 ; AVX2-NEXT: testl $512, %eax # imm = 0x200 @@ -3389,8 +3458,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: .LBB9_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: jne .LBB9_17 ; AVX512F-NEXT: # %bb.18: # %else16 ; AVX512F-NEXT: testl $512, %eax # imm = 0x200 @@ -3521,8 +3590,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -3537,8 +3606,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: testb $16, 
%al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -3548,8 +3617,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -3559,8 +3628,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -3570,8 +3639,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB10_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -3581,8 +3650,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -3592,8 +3661,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB10_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -4551,8 +4620,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB12_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -4567,8 +4636,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -4578,8 +4647,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB12_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -5101,8 +5170,8 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; 
SSE2-NEXT: movmskps %xmm2, %ecx ; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne .LBB14_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -5288,8 +5357,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -5304,8 +5373,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -5315,8 +5384,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -5326,8 +5395,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -5337,8 +5406,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -5348,8 +5417,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -5359,8 +5428,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB15_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) @@ -5371,8 +5440,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: movd %xmm2, %ecx +; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: jne .LBB15_33 ; SSE2-NEXT: # %bb.34: # %else32 ; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5387,8 +5456,8 @@ define void 
@truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) @@ -5398,8 +5467,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) @@ -5409,8 +5478,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) @@ -5420,8 +5489,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) @@ -5431,8 +5500,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) @@ -5442,8 +5511,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: jne .LBB15_61 ; SSE2-NEXT: # %bb.62: # %else60 ; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 @@ -5792,8 +5861,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX1-NEXT: .LBB15_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX1-NEXT: .LBB15_32: # %else30 -; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX1-NEXT: jne .LBB15_33 ; AVX1-NEXT: # %bb.34: # %else32 ; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 @@ -6027,8 +6096,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX2-NEXT: .LBB15_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX2-NEXT: .LBB15_32: # %else30 -; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: jne .LBB15_33 ; AVX2-NEXT: # %bb.34: # %else32 ; 
AVX2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -6263,8 +6332,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: .LBB15_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX512F-NEXT: .LBB15_32: # %else30 -; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX512F-NEXT: jne .LBB15_33 ; AVX512F-NEXT: # %bb.34: # %else32 ; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 @@ -6476,8 +6545,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: pmovmskb %xmm3, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB16_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -6492,8 +6561,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -6503,8 +6572,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -6514,8 +6583,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -6525,8 +6594,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -6536,8 +6605,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -6547,8 +6616,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB16_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -7105,8 +7174,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: packsswb 
%xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB17_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -7121,8 +7190,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -7132,8 +7201,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index da057dd084b36..ae5bd1fafed66 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -140,7 +140,6 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movapd %xmm8, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] ; SSE4-NEXT: movdqa %xmm3, %xmm6 ; SSE4-NEXT: pxor %xmm10, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 @@ -151,6 +150,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] ; SSE4-NEXT: pcmpeqd %xmm9, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm9, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 @@ -436,37 +436,37 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i64_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 -; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: movapd {{.*#+}} xmm9 = [65535,65535] -; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm11 -; SSE4-NEXT: pxor %xmm10, %xmm11 +; SSE4-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm1, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854841343,9223372036854841343] ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm11, %xmm0 -; SSE4-NEXT: movapd %xmm9, %xmm11 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm11 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 ; SSE4-NEXT: movdqa %xmm6, %xmm1 -; SSE4-NEXT: pxor %xmm10, %xmm1 +; SSE4-NEXT: pxor %xmm9, %xmm1 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: movapd %xmm9, %xmm1 +; SSE4-NEXT: movapd %xmm8, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE4-NEXT: packusdw %xmm11, %xmm1 -; SSE4-NEXT: movdqa %xmm3, %xmm6 -; SSE4-NEXT: pxor %xmm10, %xmm6 +; SSE4-NEXT: pxor %xmm6, %xmm6 +; SSE4-NEXT: packusdw %xmm10, %xmm1 +; SSE4-NEXT: movdqa %xmm3, %xmm10 +; SSE4-NEXT: pxor %xmm9, %xmm10 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: 
movapd %xmm9, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE4-NEXT: pxor %xmm2, %xmm10 -; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 +; SSE4-NEXT: pcmpgtq %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm8, %xmm10 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10 +; SSE4-NEXT: pxor %xmm2, %xmm9 +; SSE4-NEXT: pcmpgtq %xmm9, %xmm7 ; SSE4-NEXT: movdqa %xmm7, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE4-NEXT: packusdw %xmm6, %xmm9 -; SSE4-NEXT: packusdw %xmm9, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE4-NEXT: packusdw %xmm10, %xmm8 +; SSE4-NEXT: packusdw %xmm8, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE4-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax @@ -537,20 +537,20 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm8 ; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-NEXT: vpxor %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = [65535,65535] +; AVX1-NEXT: # xmm9 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm8 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535] -; AVX1-NEXT: # xmm5 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm3, %xmm9, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3 +; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm9, %xmm3 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 @@ -624,12 +624,12 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm7 +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm6, %ymm7 +; AVX2-NEXT: vblendvpd %ymm7, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -846,8 +846,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: 
packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -862,8 +862,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB2_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB2_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -873,8 +873,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB2_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB2_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -923,13 +923,13 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movapd %xmm9, %xmm1 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE4-NEXT: packusdw %xmm11, %xmm1 ; SSE4-NEXT: movdqa %xmm3, %xmm6 ; SSE4-NEXT: pxor %xmm10, %xmm6 ; SSE4-NEXT: movdqa %xmm7, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 ; SSE4-NEXT: movapd %xmm9, %xmm6 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE4-NEXT: packusdw %xmm11, %xmm1 ; SSE4-NEXT: pxor %xmm2, %xmm10 ; SSE4-NEXT: pcmpgtq %xmm10, %xmm7 ; SSE4-NEXT: movdqa %xmm7, %xmm0 @@ -1020,10 +1020,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm3, %xmm9, %xmm5, %xmm3 ; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm6 ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 @@ -1094,22 +1094,22 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm5, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax ; AVX2-NEXT: testb $1, %al @@ -1526,24 +1526,24 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm6, %xmm6 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm8 -; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm1, %xmm7 +; SSE4-NEXT: pxor %xmm6, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE4-NEXT: pxor %xmm3, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm3, %xmm6 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE4-NEXT: packusdw %xmm8, %xmm5 +; SSE4-NEXT: packusdw %xmm7, %xmm5 ; SSE4-NEXT: packusdw %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE4-NEXT: movmskps %xmm6, %eax +; SSE4-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE4-NEXT: movmskps %xmm1, %eax ; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 @@ -1628,13 +1628,13 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v4i64_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342] -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342] +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1771,8 +1771,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx -; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -1807,25 +1807,25 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; 
SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm3 -; SSE4-NEXT: pxor %xmm6, %xmm6 ; SSE4-NEXT: movapd {{.*#+}} xmm5 = [255,255] -; SSE4-NEXT: movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE4-NEXT: movdqa %xmm1, %xmm8 -; SSE4-NEXT: pxor %xmm7, %xmm8 +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE4-NEXT: movdqa %xmm1, %xmm7 +; SSE4-NEXT: pxor %xmm6, %xmm7 ; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; SSE4-NEXT: movdqa %xmm4, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm8, %xmm0 -; SSE4-NEXT: movapd %xmm5, %xmm8 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE4-NEXT: pxor %xmm3, %xmm7 -; SSE4-NEXT: pcmpgtq %xmm7, %xmm4 +; SSE4-NEXT: pcmpgtq %xmm7, %xmm0 +; SSE4-NEXT: movapd %xmm5, %xmm7 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE4-NEXT: pxor %xmm3, %xmm6 +; SSE4-NEXT: pcmpgtq %xmm6, %xmm4 ; SSE4-NEXT: movdqa %xmm4, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE4-NEXT: packusdw %xmm8, %xmm5 +; SSE4-NEXT: pxor %xmm0, %xmm0 +; SSE4-NEXT: packusdw %xmm7, %xmm5 ; SSE4-NEXT: packusdw %xmm5, %xmm5 ; SSE4-NEXT: packuswb %xmm5, %xmm5 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE4-NEXT: movmskps %xmm6, %eax +; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE4-NEXT: movmskps %xmm0, %eax ; SSE4-NEXT: xorl $15, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 @@ -1858,24 +1858,24 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: # xmm5 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [255,255] -; AVX1-NEXT: # xmm5 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm2, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax ; AVX1-NEXT: xorl $15, %eax ; AVX1-NEXT: testb $1, %al @@ -1914,8 +1914,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: 
vpxor %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062] +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -2024,22 +2024,22 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-LABEL: truncstore_v2i64_v2i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al @@ -2193,11 +2193,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-LABEL: truncstore_v2i64_v2i16: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: pxor %xmm2, %xmm0 ; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 @@ -2220,10 +2220,10 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; ; AVX-LABEL: truncstore_v2i64_v2i16: ; AVX: # %bb.0: +; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 @@ -2297,27 +2297,27 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { define void 
@truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-LABEL: truncstore_v2i64_v2i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: testb $1, %al -; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: jne .LBB8_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -2335,14 +2335,14 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-LABEL: truncstore_v2i64_v2i8: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] ; SSE4-NEXT: pxor %xmm2, %xmm0 ; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: pxor %xmm0, %xmm0 ; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 -; SSE4-NEXT: movmskpd %xmm3, %eax +; SSE4-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE4-NEXT: movmskpd %xmm0, %eax ; SSE4-NEXT: xorl $3, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 @@ -2361,10 +2361,10 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; ; AVX-LABEL: truncstore_v2i64_v2i8: ; AVX: # %bb.0: +; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax @@ -2602,7 +2602,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm9, %xmm9 -; SSE4-NEXT: pmovsxbw 
{{.*#+}} xmm8 = [65535,0,65535,0,65535,0,65535,0] +; SSE4-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535] ; SSE4-NEXT: pminud %xmm8, %xmm1 ; SSE4-NEXT: pminud %xmm8, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -2778,8 +2778,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX1-NEXT: .LBB9_15: # %cond.store13 ; AVX1-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX1-NEXT: .LBB9_16: # %else14 -; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $256, %eax # imm = 0x100 ; AVX1-NEXT: jne .LBB9_17 ; AVX1-NEXT: # %bb.18: # %else16 ; AVX1-NEXT: testl $512, %eax # imm = 0x200 @@ -2910,8 +2910,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX2-NEXT: .LBB9_15: # %cond.store13 ; AVX2-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX2-NEXT: .LBB9_16: # %else14 -; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $256, %eax # imm = 0x100 ; AVX2-NEXT: jne .LBB9_17 ; AVX2-NEXT: # %bb.18: # %else16 ; AVX2-NEXT: testl $512, %eax # imm = 0x200 @@ -3030,8 +3030,8 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: .LBB9_15: # %cond.store13 ; AVX512F-NEXT: vpextrw $7, %xmm0, 14(%rdi) ; AVX512F-NEXT: .LBB9_16: # %else14 -; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $256, %eax # imm = 0x100 ; AVX512F-NEXT: jne .LBB9_17 ; AVX512F-NEXT: # %bb.18: # %else16 ; AVX512F-NEXT: testl $512, %eax # imm = 0x200 @@ -3187,8 +3187,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB10_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -3203,8 +3203,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB10_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB10_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -3214,8 +3214,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB10_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB10_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -3225,8 +3225,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB10_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB10_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -3236,8 +3236,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB10_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB10_22 ; 
SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -3247,8 +3247,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB10_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB10_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -3258,8 +3258,8 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB10_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB10_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -3292,7 +3292,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; SSE4-LABEL: truncstore_v16i32_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm8, %xmm8 -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm9 = [255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm9, %xmm1 ; SSE4-NEXT: pminud %xmm9, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -3936,7 +3936,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovsxbw {{.*#+}} xmm5 = [65535,0,65535,0,65535,0,65535,0] +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -4078,12 +4078,12 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v8i32_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax ; AVX2-NEXT: testb $1, %al @@ -4265,8 +4265,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm4, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB12_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -4281,8 +4281,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB12_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm4, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB12_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -4292,8 +4292,8 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB12_12: 
# %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm4, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB12_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -4326,7 +4326,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255] ; SSE4-NEXT: pminud %xmm5, %xmm1 ; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 @@ -4634,19 +4634,19 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i32_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrld $16, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: psrld $16, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 @@ -4860,21 +4860,21 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-LABEL: truncstore_v4i32_v4i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrld $24, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: jne .LBB14_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -4984,12 +4984,12 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; ; AVX2-LABEL: truncstore_v4i32_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255] -; AVX2-NEXT: vpminud %xmm3, 
%xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax ; AVX2-NEXT: xorl $15, %eax ; AVX2-NEXT: testb $1, %al @@ -5105,8 +5105,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -5121,8 +5121,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB15_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB15_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -5132,8 +5132,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB15_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB15_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -5143,8 +5143,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB15_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB15_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -5154,8 +5154,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB15_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB15_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -5165,8 +5165,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB15_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB15_26 ; SSE2-NEXT: # %bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -5182,8 +5182,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: .LBB15_28: # %else26 ; SSE2-NEXT: psubw %xmm1, %xmm3 ; SSE2-NEXT: psubw %xmm4, %xmm2 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: je .LBB15_30 ; SSE2-NEXT: # %bb.29: # %cond.store27 ; SSE2-NEXT: movb %cl, 14(%rdi) @@ -5194,8 +5194,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.31: # %cond.store29 ; SSE2-NEXT: movb %ch, 15(%rdi) ; SSE2-NEXT: .LBB15_32: # %else30 -; SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: movd %xmm2, %ecx +; 
SSE2-NEXT: testl $65536, %eax # imm = 0x10000 ; SSE2-NEXT: jne .LBB15_33 ; SSE2-NEXT: # %bb.34: # %else32 ; SSE2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5210,8 +5210,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 19(%rdi) ; SSE2-NEXT: .LBB15_40: # %else38 -; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: pextrw $2, %xmm2, %ecx +; SSE2-NEXT: testl $1048576, %eax # imm = 0x100000 ; SSE2-NEXT: je .LBB15_42 ; SSE2-NEXT: # %bb.41: # %cond.store39 ; SSE2-NEXT: movb %cl, 20(%rdi) @@ -5221,8 +5221,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.43: # %cond.store41 ; SSE2-NEXT: movb %ch, 21(%rdi) ; SSE2-NEXT: .LBB15_44: # %else42 -; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: pextrw $3, %xmm2, %ecx +; SSE2-NEXT: testl $4194304, %eax # imm = 0x400000 ; SSE2-NEXT: je .LBB15_46 ; SSE2-NEXT: # %bb.45: # %cond.store43 ; SSE2-NEXT: movb %cl, 22(%rdi) @@ -5232,8 +5232,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.47: # %cond.store45 ; SSE2-NEXT: movb %ch, 23(%rdi) ; SSE2-NEXT: .LBB15_48: # %else46 -; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: pextrw $4, %xmm2, %ecx +; SSE2-NEXT: testl $16777216, %eax # imm = 0x1000000 ; SSE2-NEXT: je .LBB15_50 ; SSE2-NEXT: # %bb.49: # %cond.store47 ; SSE2-NEXT: movb %cl, 24(%rdi) @@ -5243,8 +5243,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.51: # %cond.store49 ; SSE2-NEXT: movb %ch, 25(%rdi) ; SSE2-NEXT: .LBB15_52: # %else50 -; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: pextrw $5, %xmm2, %ecx +; SSE2-NEXT: testl $67108864, %eax # imm = 0x4000000 ; SSE2-NEXT: je .LBB15_54 ; SSE2-NEXT: # %bb.53: # %cond.store51 ; SSE2-NEXT: movb %cl, 26(%rdi) @@ -5254,8 +5254,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.55: # %cond.store53 ; SSE2-NEXT: movb %ch, 27(%rdi) ; SSE2-NEXT: .LBB15_56: # %else54 -; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: pextrw $6, %xmm2, %ecx +; SSE2-NEXT: testl $268435456, %eax # imm = 0x10000000 ; SSE2-NEXT: je .LBB15_58 ; SSE2-NEXT: # %bb.57: # %cond.store55 ; SSE2-NEXT: movb %cl, 28(%rdi) @@ -5265,8 +5265,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE2-NEXT: # %bb.59: # %cond.store57 ; SSE2-NEXT: movb %ch, 29(%rdi) ; SSE2-NEXT: .LBB15_60: # %else58 -; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: pextrw $7, %xmm2, %ecx +; SSE2-NEXT: testl $1073741824, %eax # imm = 0x40000000 ; SSE2-NEXT: jne .LBB15_61 ; SSE2-NEXT: # %bb.62: # %else60 ; SSE2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 @@ -5314,7 +5314,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; SSE4-LABEL: truncstore_v32i16_v32i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: pmovzxbw {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pminuw %xmm6, %xmm1 ; SSE4-NEXT: pminuw %xmm6, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 @@ -5623,8 +5623,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX1-NEXT: .LBB15_31: # %cond.store29 ; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX1-NEXT: .LBB15_32: # %else30 -; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX1-NEXT: jne .LBB15_33 ; AVX1-NEXT: # %bb.34: # %else32 ; AVX1-NEXT: testl $131072, %eax # imm = 0x20000 @@ -5861,8 +5861,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX2-NEXT: .LBB15_31: # %cond.store29 ; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX2-NEXT: .LBB15_32: # %else30 -; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX2-NEXT: jne .LBB15_33 ; AVX2-NEXT: # %bb.34: # %else32 ; AVX2-NEXT: testl $131072, %eax # imm = 0x20000 @@ -6100,8 +6100,8 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: .LBB15_31: # %cond.store29 ; AVX512F-NEXT: vpextrb $15, %xmm0, 15(%rdi) ; AVX512F-NEXT: .LBB15_32: # %else30 -; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: testl $65536, %eax # imm = 0x10000 ; AVX512F-NEXT: jne .LBB15_33 ; AVX512F-NEXT: # %bb.34: # %else32 ; AVX512F-NEXT: testl $131072, %eax # imm = 0x20000 @@ -6316,8 +6316,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE2-NEXT: pmovmskb %xmm3, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB16_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -6332,8 +6332,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB16_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB16_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -6343,8 +6343,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB16_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB16_14 ; SSE2-NEXT: # %bb.13: # %cond.store11 ; SSE2-NEXT: movb %cl, 6(%rdi) @@ -6354,8 +6354,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.15: # %cond.store13 ; SSE2-NEXT: movb %ch, 7(%rdi) ; SSE2-NEXT: .LBB16_16: # %else14 -; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: testl $256, %eax # imm = 0x100 ; SSE2-NEXT: je .LBB16_18 ; SSE2-NEXT: # %bb.17: # %cond.store15 ; SSE2-NEXT: movb %cl, 8(%rdi) @@ -6365,8 +6365,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.19: # %cond.store17 ; SSE2-NEXT: movb %ch, 9(%rdi) ; SSE2-NEXT: .LBB16_20: # %else18 -; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: testl $1024, %eax # imm = 0x400 ; SSE2-NEXT: je .LBB16_22 ; SSE2-NEXT: # %bb.21: # %cond.store19 ; SSE2-NEXT: movb %cl, 10(%rdi) @@ -6376,8 +6376,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.23: # %cond.store21 ; SSE2-NEXT: movb %ch, 11(%rdi) ; SSE2-NEXT: .LBB16_24: # %else22 -; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: testl $4096, %eax # imm = 0x1000 ; SSE2-NEXT: je .LBB16_26 ; SSE2-NEXT: # 
%bb.25: # %cond.store23 ; SSE2-NEXT: movb %cl, 12(%rdi) @@ -6387,8 +6387,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE2-NEXT: # %bb.27: # %cond.store25 ; SSE2-NEXT: movb %ch, 13(%rdi) ; SSE2-NEXT: .LBB16_28: # %else26 -; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: testl $16384, %eax # imm = 0x4000 ; SSE2-NEXT: jne .LBB16_29 ; SSE2-NEXT: # %bb.30: # %else28 ; SSE2-NEXT: testl $32768, %eax # imm = 0x8000 @@ -6421,7 +6421,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE4-NEXT: pminuw %xmm4, %xmm1 ; SSE4-NEXT: pminuw %xmm4, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 @@ -6953,8 +6953,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: notl %eax -; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB17_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %al @@ -6969,8 +6969,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: shrl $24, %ecx ; SSE2-NEXT: movb %cl, 3(%rdi) ; SSE2-NEXT: .LBB17_8: # %else6 -; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB17_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 ; SSE2-NEXT: movb %cl, 4(%rdi) @@ -6980,8 +6980,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: # %bb.11: # %cond.store9 ; SSE2-NEXT: movb %ch, 5(%rdi) ; SSE2-NEXT: .LBB17_12: # %else10 -; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: jne .LBB17_13 ; SSE2-NEXT: # %bb.14: # %else12 ; SSE2-NEXT: testb $-128, %al @@ -7149,8 +7149,8 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 9a5bd1ce87c03..64a3f5dd71095 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -406,7 +406,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] +; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -465,7 +465,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; 
AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 ; AVX512VL-NEXT: retq entry: @@ -774,7 +774,7 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 ; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] +; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] ; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 ; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) ; AVX512F-NEXT: vmovapd %zmm3, (%rdi) @@ -792,16 +792,16 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm4, %xmm3, %xmm10 ; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm7 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm9, %xmm7 ; AVX512VL-NEXT: vmulsd %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vmulsd %xmm4, %xmm5, %xmm4 ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm4 ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] @@ -830,7 +830,7 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] +; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] ; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 ; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) ; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) @@ -1942,9 +1942,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX1OR2-NEXT: pushq %rbp ; AVX1OR2-NEXT: movq %rsp, %rbp ; AVX1OR2-NEXT: andq $-32, %rsp -; AVX1OR2-NEXT: subq $32, %rsp -; AVX1OR2-NEXT: movq %rdi, %rax +; AVX1OR2-NEXT: subq $64, %rsp ; AVX1OR2-NEXT: vbroadcastss 16(%rbp), %ymm8 +; AVX1OR2-NEXT: movq %rdi, %rax ; AVX1OR2-NEXT: vmulps %ymm0, %ymm8, %ymm8 ; AVX1OR2-NEXT: vbroadcastss 20(%rbp), %ymm9 ; AVX1OR2-NEXT: vmulps %ymm1, %ymm9, %ymm9 @@ -1967,6 +1967,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX1OR2-NEXT: vbroadcastss 44(%rbp), %ymm9 ; AVX1OR2-NEXT: vmulps %ymm7, %ymm9, %ymm9 ; AVX1OR2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1OR2-NEXT: vmovaps %ymm8, (%rsp) # 32-byte Spill ; AVX1OR2-NEXT: vbroadcastss 48(%rbp), %ymm9 ; AVX1OR2-NEXT: vmulps %ymm0, %ymm9, %ymm9 ; AVX1OR2-NEXT: vbroadcastss 52(%rbp), %ymm10 @@ -2014,30 +2015,30 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX1OR2-NEXT: vmulps %ymm7, %ymm11, %ymm11 ; AVX1OR2-NEXT: vaddps %ymm11, %ymm10, %ymm10 ; AVX1OR2-NEXT: vbroadcastss 112(%rbp), %ymm11 -; AVX1OR2-NEXT: vmulps %ymm0, %ymm11, %ymm11 ; AVX1OR2-NEXT: 
vbroadcastss 116(%rbp), %ymm12 +; AVX1OR2-NEXT: vmulps %ymm0, %ymm11, %ymm11 ; AVX1OR2-NEXT: vmulps %ymm1, %ymm12, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 120(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 120(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm2, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 124(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 124(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm3, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm3, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 128(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 128(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm4, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm4, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 132(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 132(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm5, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm5, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 136(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 136(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm6, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm6, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 140(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 140(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm7, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm7, %ymm13, %ymm12 +; AVX1OR2-NEXT: vbroadcastss 144(%rbp), %ymm13 ; AVX1OR2-NEXT: vaddps %ymm12, %ymm11, %ymm11 -; AVX1OR2-NEXT: vbroadcastss 144(%rbp), %ymm12 -; AVX1OR2-NEXT: vmulps %ymm0, %ymm12, %ymm12 +; AVX1OR2-NEXT: vmulps %ymm0, %ymm13, %ymm12 ; AVX1OR2-NEXT: vbroadcastss 148(%rbp), %ymm13 ; AVX1OR2-NEXT: vmulps %ymm1, %ymm13, %ymm13 ; AVX1OR2-NEXT: vaddps %ymm13, %ymm12, %ymm12 @@ -2106,27 +2107,27 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX1OR2-NEXT: vmulps %ymm7, %ymm15, %ymm15 ; AVX1OR2-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX1OR2-NEXT: vbroadcastss 240(%rbp), %ymm15 +; AVX1OR2-NEXT: vbroadcastss 244(%rbp), %ymm8 ; AVX1OR2-NEXT: vmulps %ymm0, %ymm15, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 244(%rbp), %ymm15 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm15, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm1, %ymm8, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 248(%rbp), %ymm8 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 248(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm8, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 252(%rbp), %ymm2 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 252(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm3, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 256(%rbp), %ymm2 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 256(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm4, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm4, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 260(%rbp), %ymm2 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 260(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm5, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 264(%rbp), %ymm2 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 264(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm6, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm6, %ymm1 +; AVX1OR2-NEXT: vbroadcastss 268(%rbp), %ymm2 ; 
AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss 268(%rbp), %ymm1 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm7, %ymm1 +; AVX1OR2-NEXT: vmulps %ymm2, %ymm7, %ymm1 ; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1OR2-NEXT: vmovaps %ymm0, 224(%rdi) ; AVX1OR2-NEXT: vmovaps %ymm14, 192(%rdi) @@ -2135,7 +2136,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX1OR2-NEXT: vmovaps %ymm11, 96(%rdi) ; AVX1OR2-NEXT: vmovaps %ymm10, 64(%rdi) ; AVX1OR2-NEXT: vmovaps %ymm9, 32(%rdi) -; AVX1OR2-NEXT: vmovaps %ymm8, (%rdi) +; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload +; AVX1OR2-NEXT: vmovaps %ymm0, (%rdi) ; AVX1OR2-NEXT: movq %rbp, %rsp ; AVX1OR2-NEXT: popq %rbp ; AVX1OR2-NEXT: vzeroupper @@ -3413,43 +3415,43 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1OR2-NEXT: pushq %rbp ; AVX1OR2-NEXT: movq %rsp, %rbp ; AVX1OR2-NEXT: andq $-32, %rsp -; AVX1OR2-NEXT: subq $448, %rsp # imm = 0x1C0 -; AVX1OR2-NEXT: vmovapd %ymm2, %ymm12 -; AVX1OR2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX1OR2-NEXT: subq $416, %rsp # imm = 0x1A0 +; AVX1OR2-NEXT: vmovapd %ymm1, %ymm13 +; AVX1OR2-NEXT: vmovapd %ymm0, %ymm9 ; AVX1OR2-NEXT: movq %rdi, %rax -; AVX1OR2-NEXT: vmovapd 144(%rbp), %ymm13 -; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm14 -; AVX1OR2-NEXT: vbroadcastsd 272(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX1OR2-NEXT: vmovapd %ymm1, %ymm9 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd 48(%rbp), %ymm14 +; AVX1OR2-NEXT: vbroadcastsd 272(%rbp), %ymm8 +; AVX1OR2-NEXT: vmovapd 16(%rbp), %ymm15 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm8, %ymm12 ; AVX1OR2-NEXT: vbroadcastsd 280(%rbp), %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 ; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm11, %ymm8, %ymm1 +; AVX1OR2-NEXT: vaddpd %ymm11, %ymm12, %ymm1 ; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vaddpd %ymm10, %ymm8, %ymm0 ; AVX1OR2-NEXT: vbroadcastsd 288(%rbp), %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm4, %ymm8 ; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 296(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 296(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm7, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1OR2-NEXT: vmulpd %ymm6, %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1OR2-NEXT: vbroadcastsd 304(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 312(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1OR2-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm13, 
%ymm11 +; AVX1OR2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1OR2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 @@ -3461,318 +3463,313 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1OR2-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX1OR2-NEXT: vmovapd %ymm13, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 344(%rbp), %ymm10 ; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1OR2-NEXT: vmovapd %ymm3, %ymm8 +; AVX1OR2-NEXT: vmovapd %ymm3, %ymm13 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 ; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm2, %ymm3 +; AVX1OR2-NEXT: vbroadcastsd 352(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 352(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm5, %ymm3 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm8, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm8, %ymm12 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm11, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm5, %ymm8 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 368(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 16(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm15, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm14, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 376(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd 80(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 384(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 384(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm14 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1OR2-NEXT: vmovapd 144(%rbp), %ymm14 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm14, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm15 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm15, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 240(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1OR2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1OR2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 400(%rbp), %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1OR2-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm8, %ymm5 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 416(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd %ymm4, %ymm2 +; AVX1OR2-NEXT: vbroadcastsd 408(%rbp), %ymm1 +; AVX1OR2-NEXT: vmovapd %ymm13, %ymm5 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm13, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm3, %ymm2 +; AVX1OR2-NEXT: vmovapd %ymm9, %ymm13 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm3, %ymm1 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 424(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1OR2-NEXT: vbroadcastsd 416(%rbp), %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm12, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 424(%rbp), %ymm11 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm7, %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm6, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 432(%rbp), %ymm10 ; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 440(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 456(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1OR2-NEXT: vbroadcastsd 448(%rbp), %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1OR2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 456(%rbp), %ymm11 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 240(%rbp), %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1OR2-NEXT: vmovapd 208(%rbp), 
%ymm0 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm11, %ymm0 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1OR2-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 -; AVX1OR2-NEXT: vmovapd %ymm9, %ymm13 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm2, %ymm1 +; AVX1OR2-NEXT: vmovapd %ymm2, %ymm12 ; AVX1OR2-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 -; AVX1OR2-NEXT: vmovapd %ymm15, %ymm9 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm3, %ymm9 +; AVX1OR2-NEXT: vbroadcastsd 480(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 480(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1OR2-NEXT: vmovapd %ymm4, %ymm3 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmovapd %ymm2, %ymm15 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm11, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm4, %ymm15 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm8, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm7, %ymm8 ; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd %ymm6, %ymm7 ; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 496(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 496(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmovapd 48(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd 16(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd 48(%rbp), %ymm14 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm14, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 504(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd 80(%rbp), %ymm14 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1OR2-NEXT: vmovapd 80(%rbp), %ymm3 +; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 512(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd 144(%rbp), %ymm3 +; AVX1OR2-NEXT: vmulpd %ymm3, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 176(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 520(%rbp), %ymm10 ; AVX1OR2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1OR2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1OR2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 528(%rbp), %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; 
AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1OR2-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 536(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1OR2-NEXT: vmovapd %ymm5, %ymm6 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm12, %ymm5 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1OR2-NEXT: vmovapd %ymm3, %ymm12 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm0 +; AVX1OR2-NEXT: vbroadcastsd 536(%rbp), %ymm1 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm5, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm13, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 552(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1OR2-NEXT: vbroadcastsd 544(%rbp), %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 552(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 560(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm4, %ymm3 +; AVX1OR2-NEXT: vmulpd %ymm7, %ymm11, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm7, %ymm13 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm6, %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 568(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; AVX1OR2-NEXT: vbroadcastsd 560(%rbp), %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 568(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 144(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 240(%rbp), %ymm14 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1OR2-NEXT: vbroadcastsd 576(%rbp), %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1OR2-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm7 +; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 584(%rbp), %ymm11 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmovapd 240(%rbp), %ymm14 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm14, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: 
vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1OR2-NEXT: vmovapd 208(%rbp), %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm11, %ymm0 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1OR2-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm12, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm0, %ymm4, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 608(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm15, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm8, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 616(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1OR2-NEXT: vmovapd %ymm13, %ymm3 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd %ymm6, %ymm7 +; AVX1OR2-NEXT: vbroadcastsd 624(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 624(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd 16(%rbp), %ymm13 +; AVX1OR2-NEXT: vmulpd %ymm11, %ymm13, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 -; AVX1OR2-NEXT: vmovapd 80(%rbp), %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1OR2-NEXT: vbroadcastsd 640(%rbp), %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1OR2-NEXT: vmovapd 144(%rbp), %ymm6 +; AVX1OR2-NEXT: vmulpd %ymm6, %ymm11, %ymm10 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1OR2-NEXT: vmulpd 176(%rbp), %ymm11, %ymm10 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 648(%rbp), %ymm10 -; AVX1OR2-NEXT: vmovapd %ymm14, %ymm4 ; AVX1OR2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1OR2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1OR2-NEXT: vmovapd 208(%rbp), %ymm11 +; AVX1OR2-NEXT: vmulpd %ymm10, %ymm11, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 656(%rbp), %ymm2 -; AVX1OR2-NEXT: 
vmovapd %ymm13, %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm13, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm6, %ymm14 -; AVX1OR2-NEXT: vmovapd %ymm6, %ymm10 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm0, %ymm5, %ymm0 -; AVX1OR2-NEXT: vmovapd %ymm5, %ymm6 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm2, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 672(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm12, %ymm14 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm12, %ymm0 +; AVX1OR2-NEXT: vbroadcastsd 664(%rbp), %ymm1 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm5, %ymm14 ; AVX1OR2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm15, %ymm2 -; AVX1OR2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 680(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm8, %ymm14 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm4, %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm2, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 672(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm15, %ymm14 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm7, %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm8, %ymm2 +; AVX1OR2-NEXT: vbroadcastsd 680(%rbp), %ymm14 ; AVX1OR2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 688(%rbp), %ymm2 -; AVX1OR2-NEXT: vmovapd 16(%rbp), %ymm11 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm11, %ymm14 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 +; AVX1OR2-NEXT: vmovapd %ymm3, %ymm10 +; AVX1OR2-NEXT: vmulpd %ymm3, %ymm14, %ymm2 +; AVX1OR2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmulpd %ymm7, %ymm14, %ymm2 ; AVX1OR2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1OR2-NEXT: vbroadcastsd 696(%rbp), %ymm2 -; AVX1OR2-NEXT: vmovapd 112(%rbp), %ymm5 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm5, %ymm14 +; AVX1OR2-NEXT: vbroadcastsd 688(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm13, %ymm14 ; AVX1OR2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1OR2-NEXT: vmovapd 80(%rbp), %ymm5 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm5, %ymm2 +; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 +; AVX1OR2-NEXT: vbroadcastsd 696(%rbp), %ymm14 ; AVX1OR2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastsd 704(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 -; AVX1OR2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 -; AVX1OR2-NEXT: vmovapd 176(%rbp), %ymm13 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm13, %ymm2 +; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm14, %ymm2 +; AVX1OR2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm14, %ymm2 ; AVX1OR2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 704(%rbp), %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm6, %ymm14 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm14, %ymm14 +; AVX1OR2-NEXT: vmulpd 176(%rbp), %ymm2, %ymm1 ; AVX1OR2-NEXT: vbroadcastsd 712(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm4, %ymm14 -; AVX1OR2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 -; AVX1OR2-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm14, %ymm2 -; AVX1OR2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vmovapd 240(%rbp), %ymm13 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm13, %ymm1 +; AVX1OR2-NEXT: vaddpd %ymm1, %ymm0, %ymm1 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm11, %ymm0 +; AVX1OR2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 ; AVX1OR2-NEXT: vbroadcastsd 720(%rbp), %ymm2 -; AVX1OR2-NEXT: vmulpd 
%ymm2, %ymm3, %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm12, %ymm3 +; AVX1OR2-NEXT: vmulpd %ymm2, %ymm4, %ymm2 ; AVX1OR2-NEXT: vbroadcastsd 728(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm5, %ymm5 ; AVX1OR2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm6, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm9, %ymm4 +; AVX1OR2-NEXT: vbroadcastsd 736(%rbp), %ymm5 +; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm15, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1OR2-NEXT: vbroadcastsd 736(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm12, %ymm5 -; AVX1OR2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm15, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm8, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1OR2-NEXT: vbroadcastsd 744(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm8, %ymm5 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 ; AVX1OR2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1OR2-NEXT: vmulpd %ymm4, %ymm7, %ymm4 +; AVX1OR2-NEXT: vbroadcastsd 752(%rbp), %ymm5 +; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1OR2-NEXT: vmulpd 16(%rbp), %ymm5, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1OR2-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 -; AVX1OR2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 +; AVX1OR2-NEXT: vmulpd 48(%rbp), %ymm5, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1OR2-NEXT: vbroadcastsd 760(%rbp), %ymm4 ; AVX1OR2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 ; AVX1OR2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1OR2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 +; AVX1OR2-NEXT: vbroadcastsd 768(%rbp), %ymm5 +; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1OR2-NEXT: vmulpd %ymm5, %ymm6, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1OR2-NEXT: vbroadcastsd 768(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 -; AVX1OR2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm13, %ymm4 +; AVX1OR2-NEXT: vmulpd 176(%rbp), %ymm5, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1OR2-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX1OR2-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm13, %ymm5 ; AVX1OR2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 -; AVX1OR2-NEXT: vmulpd %ymm4, %ymm14, %ymm4 +; AVX1OR2-NEXT: vmulpd %ymm4, %ymm11, %ymm4 ; AVX1OR2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1OR2-NEXT: vmovapd %ymm3, 480(%rdi) ; AVX1OR2-NEXT: vmovapd %ymm2, 448(%rdi) @@ -3780,7 +3777,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1OR2-NEXT: vmovapd %ymm0, 384(%rdi) ; AVX1OR2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1OR2-NEXT: vmovaps %ymm0, 352(%rdi) -; AVX1OR2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1OR2-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload ; AVX1OR2-NEXT: vmovaps %ymm0, 320(%rdi) ; AVX1OR2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1OR2-NEXT: vmovaps %ymm0, 288(%rdi) @@ -3813,48 +3810,48 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $64, %rsp -; AVX512-NEXT: movq %rdi, %rax ; AVX512-NEXT: vmulpd 16(%rbp){1to8}, %zmm0, %zmm8 +; AVX512-NEXT: movq %rdi, %rax ; AVX512-NEXT: vmulpd 24(%rbp){1to8}, %zmm1, %zmm9 ; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 ; AVX512-NEXT: vmulpd 
32(%rbp){1to8}, %zmm2, %zmm9 +; AVX512-NEXT: vmulpd 40(%rbp){1to8}, %zmm3, %zmm10 ; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 -; AVX512-NEXT: vmulpd 40(%rbp){1to8}, %zmm3, %zmm9 -; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vaddpd %zmm10, %zmm8, %zmm8 ; AVX512-NEXT: vmulpd 48(%rbp){1to8}, %zmm4, %zmm9 ; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 ; AVX512-NEXT: vmulpd 56(%rbp){1to8}, %zmm5, %zmm9 ; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 ; AVX512-NEXT: vmulpd 64(%rbp){1to8}, %zmm6, %zmm9 +; AVX512-NEXT: vmulpd 72(%rbp){1to8}, %zmm7, %zmm10 ; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 -; AVX512-NEXT: vmulpd 72(%rbp){1to8}, %zmm7, %zmm9 -; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vaddpd %zmm10, %zmm8, %zmm8 ; AVX512-NEXT: vmulpd 80(%rbp){1to8}, %zmm0, %zmm9 ; AVX512-NEXT: vmulpd 88(%rbp){1to8}, %zmm1, %zmm10 +; AVX512-NEXT: vmulpd 96(%rbp){1to8}, %zmm2, %zmm11 ; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 -; AVX512-NEXT: vmulpd 96(%rbp){1to8}, %zmm2, %zmm10 -; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vaddpd %zmm11, %zmm9, %zmm9 ; AVX512-NEXT: vmulpd 104(%rbp){1to8}, %zmm3, %zmm10 ; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 ; AVX512-NEXT: vmulpd 112(%rbp){1to8}, %zmm4, %zmm10 ; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 ; AVX512-NEXT: vmulpd 120(%rbp){1to8}, %zmm5, %zmm10 +; AVX512-NEXT: vmulpd 128(%rbp){1to8}, %zmm6, %zmm11 ; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 -; AVX512-NEXT: vmulpd 128(%rbp){1to8}, %zmm6, %zmm10 -; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vaddpd %zmm11, %zmm9, %zmm9 ; AVX512-NEXT: vmulpd 136(%rbp){1to8}, %zmm7, %zmm10 +; AVX512-NEXT: vmulpd 144(%rbp){1to8}, %zmm0, %zmm11 +; AVX512-NEXT: vmulpd 152(%rbp){1to8}, %zmm1, %zmm12 ; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 -; AVX512-NEXT: vmulpd 144(%rbp){1to8}, %zmm0, %zmm10 -; AVX512-NEXT: vmulpd 152(%rbp){1to8}, %zmm1, %zmm11 -; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm10 ; AVX512-NEXT: vmulpd 160(%rbp){1to8}, %zmm2, %zmm11 ; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 ; AVX512-NEXT: vmulpd 168(%rbp){1to8}, %zmm3, %zmm11 ; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 ; AVX512-NEXT: vmulpd 176(%rbp){1to8}, %zmm4, %zmm11 +; AVX512-NEXT: vmulpd 184(%rbp){1to8}, %zmm5, %zmm12 ; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 -; AVX512-NEXT: vmulpd 184(%rbp){1to8}, %zmm5, %zmm11 -; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vaddpd %zmm12, %zmm10, %zmm10 ; AVX512-NEXT: vmulpd 192(%rbp){1to8}, %zmm6, %zmm11 ; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 ; AVX512-NEXT: vmulpd 200(%rbp){1to8}, %zmm7, %zmm11 @@ -3865,56 +3862,56 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX512-NEXT: vmulpd 224(%rbp){1to8}, %zmm2, %zmm12 ; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 ; AVX512-NEXT: vmulpd 232(%rbp){1to8}, %zmm3, %zmm12 +; AVX512-NEXT: vmulpd 240(%rbp){1to8}, %zmm4, %zmm13 ; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 -; AVX512-NEXT: vmulpd 240(%rbp){1to8}, %zmm4, %zmm12 -; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vaddpd %zmm13, %zmm11, %zmm11 ; AVX512-NEXT: vmulpd 248(%rbp){1to8}, %zmm5, %zmm12 ; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 ; AVX512-NEXT: vmulpd 256(%rbp){1to8}, %zmm6, %zmm12 ; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 ; AVX512-NEXT: vmulpd 264(%rbp){1to8}, %zmm7, %zmm12 +; AVX512-NEXT: vmulpd 272(%rbp){1to8}, %zmm0, %zmm13 ; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 -; AVX512-NEXT: vmulpd 272(%rbp){1to8}, %zmm0, %zmm12 -; AVX512-NEXT: vmulpd 280(%rbp){1to8}, 
%zmm1, %zmm13 -; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 280(%rbp){1to8}, %zmm1, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm13, %zmm12 ; AVX512-NEXT: vmulpd 288(%rbp){1to8}, %zmm2, %zmm13 +; AVX512-NEXT: vmulpd 296(%rbp){1to8}, %zmm3, %zmm14 ; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 -; AVX512-NEXT: vmulpd 296(%rbp){1to8}, %zmm3, %zmm13 -; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vaddpd %zmm14, %zmm12, %zmm12 ; AVX512-NEXT: vmulpd 304(%rbp){1to8}, %zmm4, %zmm13 ; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 ; AVX512-NEXT: vmulpd 312(%rbp){1to8}, %zmm5, %zmm13 ; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 ; AVX512-NEXT: vmulpd 320(%rbp){1to8}, %zmm6, %zmm13 +; AVX512-NEXT: vmulpd 328(%rbp){1to8}, %zmm7, %zmm14 ; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 -; AVX512-NEXT: vmulpd 328(%rbp){1to8}, %zmm7, %zmm13 -; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vaddpd %zmm14, %zmm12, %zmm12 ; AVX512-NEXT: vmulpd 336(%rbp){1to8}, %zmm0, %zmm13 ; AVX512-NEXT: vmulpd 344(%rbp){1to8}, %zmm1, %zmm14 +; AVX512-NEXT: vmulpd 352(%rbp){1to8}, %zmm2, %zmm15 ; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 -; AVX512-NEXT: vmulpd 352(%rbp){1to8}, %zmm2, %zmm14 -; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vaddpd %zmm15, %zmm13, %zmm13 ; AVX512-NEXT: vmulpd 360(%rbp){1to8}, %zmm3, %zmm14 ; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 ; AVX512-NEXT: vmulpd 368(%rbp){1to8}, %zmm4, %zmm14 ; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 ; AVX512-NEXT: vmulpd 376(%rbp){1to8}, %zmm5, %zmm14 +; AVX512-NEXT: vmulpd 384(%rbp){1to8}, %zmm6, %zmm15 ; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 -; AVX512-NEXT: vmulpd 384(%rbp){1to8}, %zmm6, %zmm14 -; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vaddpd %zmm15, %zmm13, %zmm13 ; AVX512-NEXT: vmulpd 392(%rbp){1to8}, %zmm7, %zmm14 +; AVX512-NEXT: vmulpd 400(%rbp){1to8}, %zmm0, %zmm15 +; AVX512-NEXT: vmulpd 408(%rbp){1to8}, %zmm1, %zmm16 ; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 -; AVX512-NEXT: vmulpd 400(%rbp){1to8}, %zmm0, %zmm14 -; AVX512-NEXT: vmulpd 408(%rbp){1to8}, %zmm1, %zmm15 -; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vaddpd %zmm16, %zmm15, %zmm14 ; AVX512-NEXT: vmulpd 416(%rbp){1to8}, %zmm2, %zmm15 ; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 ; AVX512-NEXT: vmulpd 424(%rbp){1to8}, %zmm3, %zmm15 ; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 ; AVX512-NEXT: vmulpd 432(%rbp){1to8}, %zmm4, %zmm15 +; AVX512-NEXT: vmulpd 440(%rbp){1to8}, %zmm5, %zmm16 ; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 -; AVX512-NEXT: vmulpd 440(%rbp){1to8}, %zmm5, %zmm15 -; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vaddpd %zmm16, %zmm14, %zmm14 ; AVX512-NEXT: vmulpd 448(%rbp){1to8}, %zmm6, %zmm15 ; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 ; AVX512-NEXT: vmulpd 456(%rbp){1to8}, %zmm7, %zmm15 @@ -3925,9 +3922,9 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX512-NEXT: vmulpd 480(%rbp){1to8}, %zmm2, %zmm1 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmulpd 488(%rbp){1to8}, %zmm3, %zmm1 +; AVX512-NEXT: vmulpd 496(%rbp){1to8}, %zmm4, %zmm2 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmulpd 496(%rbp){1to8}, %zmm4, %zmm1 -; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vmulpd 504(%rbp){1to8}, %zmm5, %zmm1 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmulpd 512(%rbp){1to8}, %zmm6, %zmm1 diff --git a/llvm/test/CodeGen/X86/mem-intrin-base-reg.ll 
b/llvm/test/CodeGen/X86/mem-intrin-base-reg.ll index 0360b03f95215..c27fd2400db16 100644 --- a/llvm/test/CodeGen/X86/mem-intrin-base-reg.ll +++ b/llvm/test/CodeGen/X86/mem-intrin-base-reg.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" @@ -13,6 +14,54 @@ declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture readonly, i32, declare void @llvm.memset.p0.i32(ptr nocapture, i8, i32, i1) define i32 @memcpy_novla_vector(ptr %vp0, ptr %a, ptr %b, i32 %n, i1 zeroext %cond) { +; CHECK-LABEL: memcpy_novla_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $32, %esp +; CHECK-NEXT: movzbl 24(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movl 16(%ebp), %edx +; CHECK-NEXT: movups 112(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 112(%ecx) +; CHECK-NEXT: movups 96(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 96(%ecx) +; CHECK-NEXT: movups 80(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 80(%ecx) +; CHECK-NEXT: movups 64(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 64(%ecx) +; CHECK-NEXT: movups 48(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 48(%ecx) +; CHECK-NEXT: movups 32(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 32(%ecx) +; CHECK-NEXT: movdqu (%edx), %xmm0 +; CHECK-NEXT: movups 16(%edx), %xmm1 +; CHECK-NEXT: movups %xmm1, 16(%ecx) +; CHECK-NEXT: movdqu %xmm0, (%ecx) +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je LBB0_1 +; CHECK-NEXT: # %bb.3: # %spill_vectors +; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: movl (%eax), %esi +; CHECK-NEXT: movdqa 16(%eax), %xmm0 +; CHECK-NEXT: pcmpgtd (%eax), %xmm0 +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: pushl $0 +; CHECK-NEXT: calll _escape_vla_and_icmp +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: jmp LBB0_2 +; CHECK-NEXT: LBB0_1: # %no_vectors +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: LBB0_2: # %no_vectors +; CHECK-NEXT: leal -4(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl %foo = alloca <4 x i32>, align 16 call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 128, i1 false) br i1 %cond, label %spill_vectors, label %no_vectors @@ -31,14 +80,63 @@ spill_vectors: ret i32 %r } -; CHECK-LABEL: _memcpy_novla_vector: -; CHECK: andl $-16, %esp -; CHECK-DAG: movl $32, %ecx -; CHECK-DAG: movl {{.*}}, %esi -; CHECK-DAG: movl {{.*}}, %edi -; CHECK: rep;movsl - define i32 @memcpy_vla_vector(ptr %vp0, ptr %a, ptr %b, i32 %n, i1 zeroext %cond) { +; CHECK-LABEL: memcpy_vla_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: movl %esp, %esi +; CHECK-NEXT: movzbl 24(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movl 16(%ebp), %edx +; CHECK-NEXT: movups 112(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 112(%ecx) +; CHECK-NEXT: movups 96(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 96(%ecx) +; CHECK-NEXT: movups 80(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 80(%ecx) +; CHECK-NEXT: movups 64(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 64(%ecx) +; CHECK-NEXT: movups 48(%edx), %xmm0 +; CHECK-NEXT: movups %xmm0, 48(%ecx) +; CHECK-NEXT: movups 32(%edx), %xmm0 +; 
CHECK-NEXT: movups %xmm0, 32(%ecx) +; CHECK-NEXT: movdqu (%edx), %xmm0 +; CHECK-NEXT: movups 16(%edx), %xmm1 +; CHECK-NEXT: movups %xmm1, 16(%ecx) +; CHECK-NEXT: movdqu %xmm0, (%ecx) +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je LBB1_1 +; CHECK-NEXT: # %bb.3: # %spill_vectors +; CHECK-NEXT: movl 20(%ebp), %eax +; CHECK-NEXT: movl 8(%ebp), %ecx +; CHECK-NEXT: movl (%ecx), %edi +; CHECK-NEXT: movdqa 16(%ecx), %xmm0 +; CHECK-NEXT: pcmpgtd (%ecx), %xmm0 +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: calll __chkstk +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: calll _escape_vla_and_icmp +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: jmp LBB1_2 +; CHECK-NEXT: LBB1_1: # %no_vectors +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: LBB1_2: # %no_vectors +; CHECK-NEXT: leal -8(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl %foo = alloca <4 x i32>, align 16 call void @llvm.memcpy.p0.p0.i32(ptr align 4 %a, ptr align 4 %b, i32 128, i1 false) br i1 %cond, label %spill_vectors, label %no_vectors @@ -58,16 +156,56 @@ spill_vectors: ret i32 %r } -; CHECK-LABEL: _memcpy_vla_vector: -; CHECK: andl $-16, %esp -; CHECK: movl %esp, %esi -; CHECK: pushl $128 -; CHECK: calll _memcpy -; CHECK: calll __chkstk - -; stosd doesn't clobber esi, so we can use it. define i32 @memset_vla_vector(ptr %vp0, ptr %a, i32 %n, i1 zeroext %cond) { +; CHECK-LABEL: memset_vla_vector: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: movl %esp, %esi +; CHECK-NEXT: movzbl 20(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; CHECK-NEXT: movdqu %xmm0, 112(%ecx) +; CHECK-NEXT: movdqu %xmm0, 96(%ecx) +; CHECK-NEXT: movdqu %xmm0, 80(%ecx) +; CHECK-NEXT: movdqu %xmm0, 64(%ecx) +; CHECK-NEXT: movdqu %xmm0, 48(%ecx) +; CHECK-NEXT: movdqu %xmm0, 32(%ecx) +; CHECK-NEXT: movdqu %xmm0, 16(%ecx) +; CHECK-NEXT: movdqu %xmm0, (%ecx) +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je LBB2_1 +; CHECK-NEXT: # %bb.3: # %spill_vectors +; CHECK-NEXT: movl 16(%ebp), %eax +; CHECK-NEXT: movl 8(%ebp), %ecx +; CHECK-NEXT: movl (%ecx), %edi +; CHECK-NEXT: movdqa 16(%ecx), %xmm0 +; CHECK-NEXT: pcmpgtd (%ecx), %xmm0 +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: calll __chkstk +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: calll _escape_vla_and_icmp +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: jmp LBB2_2 +; CHECK-NEXT: LBB2_1: # %no_vectors +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: LBB2_2: # %no_vectors +; CHECK-NEXT: leal -8(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl %foo = alloca <4 x i32>, align 16 call void @llvm.memset.p0.i32(ptr align 4 %a, i8 42, i32 128, i1 false) br i1 %cond, label %spill_vectors, label %no_vectors @@ -87,13 +225,4 @@ spill_vectors: ret i32 %r } -; CHECK-LABEL: _memset_vla_vector: -; CHECK: andl $-16, %esp -; CHECK: movl %esp, %esi -; CHECK-DAG: movl $707406378, %eax # imm = 0x2A2A2A2A -; CHECK-DAG: movl $32, %ecx -; CHECK-DAG: movl {{.*}}, %edi 
-; CHECK-NOT: movl {{.*}}, %esi -; CHECK: rep;stosl - ; Add a test for memcmp if we ever add a special lowering for it. diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index 7d1422d3c961e..2f6dbd967640a 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -78,8 +78,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx ; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl (%eax), %edx ; X86-NEXT: rolw $8, %dx ; X86-NEXT: movzwl %cx, %eax ; X86-NEXT: movzwl %dx, %ecx @@ -147,8 +147,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx ; X86-NEXT: jne .LBB9_3 @@ -161,7 +161,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB9_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind @@ -285,7 +285,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB16_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind @@ -330,7 +330,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB18_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB18_2: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -365,7 +365,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB19_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -414,7 +414,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB21_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -449,7 +449,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB22_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -546,9 +546,9 @@ define i1 @length11_eq(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi ; X86-NEXT: movl 7(%ecx), %ecx ; X86-NEXT: xorl 7(%eax), %ecx +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -568,9 +568,9 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi ; X86-NEXT: 
movl 8(%ecx), %ecx ; X86-NEXT: xorl 8(%eax), %ecx +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: setne %al ; X86-NEXT: popl %esi @@ -611,7 +611,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB29_4: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -674,6 +674,7 @@ define i1 @length14_eq(ptr %X, ptr %Y) nounwind { define i1 @length15_eq(ptr %X, ptr %Y) nounwind { ; X86-LABEL: length15_eq: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -681,15 +682,16 @@ define i1 @length15_eq(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl 4(%edx), %eax ; X86-NEXT: xorl (%ecx), %esi ; X86-NEXT: xorl 4(%ecx), %eax +; X86-NEXT: movl 8(%edx), %edi +; X86-NEXT: xorl 8(%ecx), %edi ; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movl 11(%edx), %edx ; X86-NEXT: xorl 11(%ecx), %edx -; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind %c = icmp eq i32 %m, 0 @@ -736,7 +738,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB33_5: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -747,6 +749,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { define i1 @length16_eq(ptr %x, ptr %y) nounwind { ; X86-NOSSE-LABEL: length16_eq: ; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -754,19 +757,21 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind { ; X86-NOSSE-NEXT: movl 4(%edx), %eax ; X86-NOSSE-NEXT: xorl (%ecx), %esi ; X86-NOSSE-NEXT: xorl 4(%ecx), %eax +; X86-NOSSE-NEXT: movl 8(%edx), %edi +; X86-NOSSE-NEXT: xorl 8(%ecx), %edi ; X86-NOSSE-NEXT: orl %esi, %eax -; X86-NOSSE-NEXT: movl 8(%edx), %esi -; X86-NOSSE-NEXT: xorl 8(%ecx), %esi ; X86-NOSSE-NEXT: movl 12(%edx), %edx ; X86-NOSSE-NEXT: xorl 12(%ecx), %edx -; X86-NOSSE-NEXT: orl %esi, %edx +; X86-NOSSE-NEXT: orl %edi, %edx ; X86-NOSSE-NEXT: orl %eax, %edx ; X86-NOSSE-NEXT: setne %al ; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: retl ; ; X86-SSE1-LABEL: length16_eq: ; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %edi ; X86-SSE1-NEXT: pushl %esi ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -774,15 +779,16 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind { ; X86-SSE1-NEXT: movl 4(%edx), %eax ; X86-SSE1-NEXT: xorl (%ecx), %esi ; X86-SSE1-NEXT: xorl 4(%ecx), %eax +; X86-SSE1-NEXT: movl 8(%edx), %edi +; X86-SSE1-NEXT: xorl 8(%ecx), %edi ; X86-SSE1-NEXT: orl %esi, %eax -; X86-SSE1-NEXT: movl 8(%edx), %esi -; X86-SSE1-NEXT: xorl 8(%ecx), %esi ; X86-SSE1-NEXT: movl 12(%edx), %edx ; X86-SSE1-NEXT: xorl 12(%ecx), %edx -; X86-SSE1-NEXT: orl %esi, %edx +; X86-SSE1-NEXT: orl %edi, %edx ; X86-SSE1-NEXT: orl %eax, %edx ; X86-SSE1-NEXT: setne %al ; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: popl %edi ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length16_eq: @@ -850,7 +856,7 @@ define i1 @length16_lt(ptr %x, ptr 
%y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB35_5: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -899,7 +905,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: setae %dl -; X86-NEXT: leal -1(%edx,%edx), %edx +; X86-NEXT: leal -1(,%edx,2), %edx ; X86-NEXT: .LBB36_5: # %endblock ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setg %al @@ -915,17 +921,17 @@ define i1 @length16_eq_const(ptr %X) nounwind { ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NOSSE-NEXT: xorl (%eax), %ecx -; X86-NOSSE-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NOSSE-NEXT: xorl 4(%eax), %edx +; X86-NOSSE-NEXT: movl $858927408, %edx # imm = 0x33323130 +; X86-NOSSE-NEXT: xorl (%eax), %edx +; X86-NOSSE-NEXT: movl $926299444, %ecx # imm = 0x37363534 +; X86-NOSSE-NEXT: xorl 4(%eax), %ecx +; X86-NOSSE-NEXT: movl $825243960, %esi # imm = 0x31303938 +; X86-NOSSE-NEXT: xorl 8(%eax), %esi +; X86-NOSSE-NEXT: orl %edx, %ecx +; X86-NOSSE-NEXT: movl $892613426, %edx # imm = 0x35343332 +; X86-NOSSE-NEXT: xorl 12(%eax), %edx +; X86-NOSSE-NEXT: orl %esi, %edx ; X86-NOSSE-NEXT: orl %ecx, %edx -; X86-NOSSE-NEXT: movl $825243960, %ecx # imm = 0x31303938 -; X86-NOSSE-NEXT: xorl 8(%eax), %ecx -; X86-NOSSE-NEXT: movl $892613426, %esi # imm = 0x35343332 -; X86-NOSSE-NEXT: xorl 12(%eax), %esi -; X86-NOSSE-NEXT: orl %ecx, %esi -; X86-NOSSE-NEXT: orl %edx, %esi ; X86-NOSSE-NEXT: sete %al ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: retl @@ -934,17 +940,17 @@ define i1 @length16_eq_const(ptr %X) nounwind { ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %esi ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-SSE1-NEXT: xorl (%eax), %ecx -; X86-SSE1-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-SSE1-NEXT: xorl 4(%eax), %edx +; X86-SSE1-NEXT: movl $858927408, %edx # imm = 0x33323130 +; X86-SSE1-NEXT: xorl (%eax), %edx +; X86-SSE1-NEXT: movl $926299444, %ecx # imm = 0x37363534 +; X86-SSE1-NEXT: xorl 4(%eax), %ecx +; X86-SSE1-NEXT: movl $825243960, %esi # imm = 0x31303938 +; X86-SSE1-NEXT: xorl 8(%eax), %esi +; X86-SSE1-NEXT: orl %edx, %ecx +; X86-SSE1-NEXT: movl $892613426, %edx # imm = 0x35343332 +; X86-SSE1-NEXT: xorl 12(%eax), %edx +; X86-SSE1-NEXT: orl %esi, %edx ; X86-SSE1-NEXT: orl %ecx, %edx -; X86-SSE1-NEXT: movl $825243960, %ecx # imm = 0x31303938 -; X86-SSE1-NEXT: xorl 8(%eax), %ecx -; X86-SSE1-NEXT: movl $892613426, %esi # imm = 0x35343332 -; X86-SSE1-NEXT: xorl 12(%eax), %esi -; X86-SSE1-NEXT: orl %ecx, %esi -; X86-SSE1-NEXT: orl %edx, %esi ; X86-SSE1-NEXT: sete %al ; X86-SSE1-NEXT: popl %esi ; X86-SSE1-NEXT: retl @@ -1103,12 +1109,12 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length24_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: 
pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1116,12 +1122,12 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length24_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind @@ -1316,12 +1322,12 @@ define i1 @length31_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length31_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1329,12 +1335,12 @@ define i1 @length31_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length31_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind @@ -1531,12 +1537,12 @@ define i1 @length32_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length32_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1544,12 +1550,12 @@ define i1 @length32_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length32_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 
16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind @@ -1938,16 +1944,16 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm1 +; X86-SSE2-NEXT: movdqu 47(%eax), %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al @@ -1957,16 +1963,16 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm1 +; X86-SSE41-NEXT: movdqu 47(%eax), %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm3, %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm3, %xmm0 ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: ptest %xmm0, %xmm0 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl @@ -2121,16 +2127,16 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm1 +; X86-SSE2-NEXT: movdqu 48(%eax), %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al @@ -2140,16 +2146,16 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm1 +; X86-SSE41-NEXT: movdqu 48(%eax), %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm3, %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm3, %xmm0 ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: ptest %xmm0, %xmm0 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 3a3824a4ffe83..6df20dc1e0a61 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -78,8 +78,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length2_lt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: movzwl %cx, %ecx @@ -140,8 +140,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: jne .LBB9_3 @@ -153,7 +153,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB9_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -261,7 +261,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB16_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -302,7 +302,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB18_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -332,7 +332,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB19_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind @@ -375,7 +375,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB21_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -491,8 +491,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; 
X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB29_2 @@ -508,7 +508,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB29_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -566,8 +566,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB33_2 @@ -583,7 +583,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind @@ -636,8 +636,8 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length16_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB35_2 @@ -653,7 +653,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB35_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -667,8 +667,8 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length16_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB36_2 @@ -684,7 +684,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB36_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -740,15 +740,15 @@ define i32 @length24(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB38_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB38_3 @@ -764,7 +764,7 @@ define i32 @length24(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB38_4: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind @@ -832,15 +832,15 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length24_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB40_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; 
X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB40_3 @@ -856,7 +856,7 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB40_4: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -870,15 +870,15 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length24_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB41_3 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax -; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB41_3 @@ -894,7 +894,7 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB41_4: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -907,34 +907,34 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { define i1 @length24_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length24_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq @@ -960,22 +960,22 @@ define i32 @length31(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length31: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 
(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB43_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB43_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB43_4 @@ -991,7 +991,7 @@ define i32 @length31(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB43_5: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind @@ -1028,11 +1028,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX-LABEL: length31_eq: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1058,22 +1058,22 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length31_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB45_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB45_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB45_4 @@ -1089,7 +1089,7 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB45_5: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -1103,22 +1103,22 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length31_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB46_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax -; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB46_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB46_4 @@ -1134,7 +1134,7 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal 
-1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB46_5: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -1174,11 +1174,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= ; ; X64-AVX-LABEL: length31_eq_prefer128: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1203,34 +1203,34 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= define i1 @length31_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length31_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length31_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm1 -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm0 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: length31_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq @@ -1256,22 +1256,22 @@ define i32 @length32(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB49_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB49_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, 
%rcx ; X64-NEXT: jne .LBB49_4 @@ -1287,7 +1287,7 @@ define i32 @length32(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB49_5: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind @@ -1369,22 +1369,22 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length32_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB51_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB51_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq 16(%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB51_4 @@ -1400,7 +1400,7 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB51_5: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -1414,22 +1414,22 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length32_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB52_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax -; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB52_4 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB52_4 @@ -1445,7 +1445,7 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB52_5: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -1485,11 +1485,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= ; ; X64-AVX-LABEL: length32_eq_prefer128: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 16(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1514,24 +1514,24 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= define i1 @length32_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length32_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; @@ -1811,8 +1811,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 @@ -1823,8 +1823,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 @@ -1835,8 +1835,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 @@ -1916,11 +1916,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX1-LABEL: length63_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -1928,11 +1928,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX2-LABEL: length63_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: 
vmovdqu 31(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -1940,11 +1940,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX512-LABEL: length63_eq: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm1, %ymm1 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0 +; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al ; X64-AVX512-NEXT: vzeroupper @@ -2001,16 +2001,16 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length63_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2 -; X64-SSE2-NEXT: movdqu 47(%rdi), %xmm3 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu 47(%rdi), %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE2-NEXT: pand %xmm3, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm3 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 ; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -2019,27 +2019,27 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; X64-SSE41-LABEL: length63_eq_const: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2 -; X64-SSE41-NEXT: movdqu 47(%rdi), %xmm3 -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm1 +; X64-SSE41-NEXT: movdqu 47(%rdi), %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE41-NEXT: por %xmm3, %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm3 +; X64-SSE41-NEXT: por %xmm2, %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE41-NEXT: por %xmm3, %xmm0 ; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: ptest %xmm0, %xmm0 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq ; ; X64-AVX1-LABEL: length63_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; 
X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -2047,11 +2047,11 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; ; X64-AVX2-LABEL: length63_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -2059,11 +2059,11 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; ; X64-AVX512-LABEL: length63_eq_const: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al ; X64-AVX512-NEXT: vzeroupper @@ -2141,11 +2141,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX1-LABEL: length64_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -2153,11 +2153,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -2232,16 +2232,16 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length64_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2 -; X64-SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu 48(%rdi), %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE2-NEXT: pand %xmm3, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm3 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 
+; X64-SSE2-NEXT: pand %xmm3, %xmm0 ; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -2250,27 +2250,27 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; X64-SSE41-LABEL: length64_eq_const: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2 -; X64-SSE41-NEXT: movdqu 48(%rdi), %xmm3 -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm1 +; X64-SSE41-NEXT: movdqu 48(%rdi), %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE41-NEXT: por %xmm3, %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm3 +; X64-SSE41-NEXT: por %xmm2, %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE41-NEXT: por %xmm3, %xmm0 ; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: ptest %xmm0, %xmm0 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq ; ; X64-AVX1-LABEL: length64_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -2278,11 +2278,11 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -2579,16 +2579,16 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind { ; X64-AVX1-LABEL: length127_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm3 -; X64-AVX1-NEXT: vxorps 95(%rsi), %ymm3, %ymm3 -; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1 +; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm2 +; X64-AVX1-NEXT: vxorps 95(%rsi), %ymm2, %ymm2 +; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm3, %ymm2 ; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; 
X64-AVX1-NEXT: vzeroupper @@ -2597,16 +2597,16 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind { ; X64-AVX2-LABEL: length127_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm3 -; X64-AVX2-NEXT: vpxor 95(%rsi), %ymm3, %ymm3 -; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1 +; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm2 +; X64-AVX2-NEXT: vpxor 95(%rsi), %ymm2, %ymm2 +; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3 +; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm3, %ymm2 ; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -2715,16 +2715,16 @@ define i1 @length127_eq_const(ptr %X) nounwind { ; X64-AVX1-LABEL: length127_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm3 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1 +; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -2733,16 +2733,16 @@ define i1 @length127_eq_const(ptr %X) nounwind { ; X64-AVX2-LABEL: length127_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm3 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1 +; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3 +; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -2829,16 +2829,16 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind { ; X64-AVX1-LABEL: length128_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm3 -; X64-AVX1-NEXT: vxorps 96(%rsi), 
%ymm3, %ymm3 -; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1 +; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm2 +; X64-AVX1-NEXT: vxorps 96(%rsi), %ymm2, %ymm2 +; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm3, %ymm2 ; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -2847,16 +2847,16 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind { ; X64-AVX2-LABEL: length128_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm3 -; X64-AVX2-NEXT: vpxor 96(%rsi), %ymm3, %ymm3 -; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1 +; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm2 +; X64-AVX2-NEXT: vpxor 96(%rsi), %ymm2, %ymm2 +; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3 +; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm3, %ymm2 ; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -2965,16 +2965,16 @@ define i1 @length128_eq_const(ptr %X) nounwind { ; X64-AVX1-LABEL: length128_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm3 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1 +; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -2983,16 +2983,16 @@ define i1 @length128_eq_const(ptr %X) nounwind { ; X64-AVX2-LABEL: length128_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm3 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1 +; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3 +; X64-AVX2-NEXT: 
vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -3103,9 +3103,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1 +; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k2 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: kortestq %k2, %k0 ; X64-AVX512BW-NEXT: setne %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -3117,9 +3117,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k2 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 -; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: kortestw %k2, %k0 ; X64-AVX512F-NEXT: setne %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -3141,9 +3141,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind { ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k2 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: kortestw %k2, %k0 ; X64-MIC-AVX512F-NEXT: setne %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -3223,9 +3223,9 @@ define i1 @length192_eq_const(ptr %X) nounwind { ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k2 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: kortestq %k2, %k0 ; X64-AVX512BW-NEXT: sete %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -3237,9 +3237,9 @@ define i1 @length192_eq_const(ptr %X) nounwind { ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k2 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 -; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: kortestw %k2, %k0 ; X64-AVX512F-NEXT: sete %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -3262,9 +3262,9 @@ define i1 @length192_eq_const(ptr %X) nounwind { ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k2 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 
-; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: kortestw %k2, %k0 ; X64-MIC-AVX512F-NEXT: sete %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -3317,9 +3317,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-AVX512BW-NEXT: vpcmpneqb 191(%rsi), %zmm2, %k0 ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-AVX512BW-NEXT: vpcmpneqb 191(%rsi), %zmm3, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k1 @@ -3334,9 +3334,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm2, %k0 ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm3, %k0 ; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1 @@ -3361,9 +3361,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind { ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm2, %k0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-MIC-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm3, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1 @@ -3446,9 +3446,9 @@ define i1 @length255_eq_const(ptr %X) nounwind { ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str+191(%rip), %zmm2, %k0 ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str+191(%rip), %zmm3, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k1 @@ -3463,9 +3463,9 @@ define i1 @length255_eq_const(ptr %X) nounwind { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm2, %k0 ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm3, %k0 ; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1 @@ -3491,9 +3491,9 @@ define i1 @length255_eq_const(ptr %X) nounwind { ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm2, %k0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 
-; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm3, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1 @@ -3552,9 +3552,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm2 +; X64-AVX512BW-NEXT: vpcmpneqb 192(%rsi), %zmm2, %k0 ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-AVX512BW-NEXT: vpcmpneqb 192(%rsi), %zmm3, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k1 @@ -3569,9 +3569,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2 +; X64-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm2, %k0 ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm3, %k0 ; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1 @@ -3596,9 +3596,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind { ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm2, %k0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-MIC-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm3, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1 @@ -3681,9 +3681,9 @@ define i1 @length256_eq_const(ptr %X) nounwind { ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm2 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str+192(%rip), %zmm2, %k0 ; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str+192(%rip), %zmm3, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1 ; X64-AVX512BW-NEXT: korq %k0, %k1, %k0 ; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k1 @@ -3698,9 +3698,9 @@ define i1 @length256_eq_const(ptr %X) nounwind { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2 +; X64-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm2, %k0 ; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm3, %k0 ; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 ; X64-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1 @@ -3726,9 +3726,9 @@ define i1 @length256_eq_const(ptr %X) nounwind { ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2 +; 
X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm2, %k0 ; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2 -; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm3, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1 ; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0 ; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1 diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 09f02c3f56346..8f83ed2d38825 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -76,8 +76,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx ; X86-NEXT: jne .LBB4_3 @@ -89,7 +89,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: .LBB4_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB4_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -179,7 +179,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: .LBB9_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB9_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -229,7 +229,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB11_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -430,12 +430,12 @@ define i1 @length24_eq_const(ptr %X) nounwind optsize { ; X86-SSE2-LABEL: length24_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -506,12 +506,12 @@ define i1 @length32_eq_const(ptr %X) nounwind optsize { ; X86-SSE2-LABEL: length32_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 
4fe67fa0883de..6914ca62a7a0d 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -68,8 +68,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: jne .LBB4_3 @@ -81,7 +81,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind optsize { ; X64-NEXT: .LBB4_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -159,7 +159,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind optsize { ; X64-NEXT: .LBB9_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -240,8 +240,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB15_2 @@ -257,7 +257,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -270,8 +270,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind optsize { ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB16_2 @@ -287,7 +287,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind optsize { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind @@ -384,23 +384,23 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize { define i1 @length24_eq_const(ptr %X) nounwind optsize { ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, 
%xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq @@ -460,12 +460,12 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize { define i1 @length32_eq_const(ptr %X) nounwind optsize { ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq @@ -514,11 +514,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize { ; ; X64-AVX1-LABEL: length64_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -526,11 +526,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize { ; ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -554,11 +554,11 @@ define i1 @length64_eq_const(ptr %X) nounwind optsize { ; ; X64-AVX1-LABEL: length64_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -566,11 +566,11 @@ define i1 @length64_eq_const(ptr %X) nounwind optsize { ; ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index 1b3fd6d4ddd3b..105114049d573 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -76,8 +76,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx ; X86-NEXT: jne .LBB4_3 @@ -89,7 +89,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: .LBB4_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB4_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -179,7 +179,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: .LBB9_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB9_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -229,7 +229,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB11_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -430,12 +430,12 @@ define i1 @length24_eq_const(ptr %X) nounwind !prof !14 { ; X86-SSE2-LABEL: length24_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -506,12 +506,12 @@ define i1 @length32_eq_const(ptr %X) nounwind !prof !14 { ; X86-SSE2-LABEL: length32_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 26ee94afbce88..4847ac9a72f4c 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -68,8 +68,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: 
movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: jne .LBB4_3 @@ -81,7 +81,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-NEXT: .LBB4_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -159,7 +159,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-NEXT: .LBB9_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -240,8 +240,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB15_2 @@ -257,7 +257,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -270,8 +270,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB16_2 @@ -287,7 +287,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind @@ -384,23 +384,23 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 { define i1 @length24_eq_const(ptr %X) nounwind !prof !14 { ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: 
retq @@ -460,12 +460,12 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 { define i1 @length32_eq_const(ptr %X) nounwind !prof !14 { ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq @@ -514,11 +514,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 { ; ; X64-AVX1-LABEL: length64_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -526,11 +526,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 { ; ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -554,11 +554,11 @@ define i1 @length64_eq_const(ptr %X) nounwind !prof !14 { ; ; X64-AVX1-LABEL: length64_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -566,11 +566,11 @@ define i1 @length64_eq_const(ptr %X) nounwind !prof !14 { ; ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper diff --git 
a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index 28e732be9191d..94761a6d616cb 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -106,8 +106,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx ; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl (%eax), %edx ; X86-NEXT: rolw $8, %dx ; X86-NEXT: movzwl %cx, %eax ; X86-NEXT: movzwl %dx, %ecx @@ -175,8 +175,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl (%ecx), %esi ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx ; X86-NEXT: jne .LBB11_3 @@ -189,7 +189,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB11_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind @@ -313,7 +313,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB18_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind @@ -358,7 +358,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: .LBB20_3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB20_2: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -393,7 +393,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB21_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -425,7 +425,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB22_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -477,7 +477,7 @@ define i32 @length8(ptr %X, ptr %Y) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: .LBB24_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -975,12 +975,12 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length24_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; 
X86-SSE2-NEXT: retl @@ -988,12 +988,12 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length24_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind @@ -1188,12 +1188,12 @@ define i1 @length31_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length31_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1201,12 +1201,12 @@ define i1 @length31_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length31_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind @@ -1403,12 +1403,12 @@ define i1 @length32_eq_const(ptr %X) nounwind { ; X86-SSE2-LABEL: length32_eq_const: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1416,12 +1416,12 @@ define i1 @length32_eq_const(ptr %X) nounwind { ; X86-SSE41-LABEL: length32_eq_const: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, 
%xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 9e713bfa6c392..1a53940097cea 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -104,8 +104,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length2_lt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: movzwl %cx, %ecx @@ -166,8 +166,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %ax +; X64-NEXT: movzwl (%rsi), %ecx ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax ; X64-NEXT: jne .LBB11_3 @@ -179,7 +179,7 @@ define i32 @length3(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB11_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind ret i32 %m @@ -317,7 +317,7 @@ define i32 @length5(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB20_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind ret i32 %m @@ -358,7 +358,7 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: .LBB22_3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -388,7 +388,7 @@ define i32 @length7(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB23_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind @@ -416,7 +416,7 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl %edx, %ecx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB24_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -547,8 +547,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB33_2 @@ -564,7 +564,7 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -605,8 +605,8 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length15: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; 
X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB36_2 @@ -622,7 +622,7 @@ define i32 @length15(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB36_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind @@ -633,8 +633,8 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length15_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB37_2 @@ -650,7 +650,7 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB37_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -679,7 +679,7 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rcx, %rdx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB38_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind @@ -720,7 +720,7 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rax, %rcx ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB40_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -736,8 +736,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB41_2 @@ -753,7 +753,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB41_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind @@ -806,8 +806,8 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length16_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx +; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: jne .LBB43_2 @@ -823,7 +823,7 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: .LBB43_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -837,8 +837,8 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-LABEL: length16_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rax +; X64-NEXT: movq (%rsi), %rcx ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB44_2 @@ -854,7 +854,7 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; 
X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: .LBB44_3: # %endblock ; X64-NEXT: testl %edx, %edx ; X64-NEXT: setg %al @@ -1005,34 +1005,34 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind { define i1 @length24_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length24_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq @@ -1093,11 +1093,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX-LABEL: length31_eq: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1179,11 +1179,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= ; ; X64-AVX-LABEL: length31_eq_prefer128: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1208,34 +1208,34 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind 
"prefer-vector-width"= define i1 @length31_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length31_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length31_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm1 -; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm0 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: length31_eq_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0 ; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: setne %al ; X64-AVX-NEXT: retq @@ -1397,11 +1397,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= ; ; X64-AVX-LABEL: length32_eq_prefer128: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 -; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 -; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm0 +; X64-AVX-NEXT: vpxor 16(%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: retq @@ -1426,24 +1426,24 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"= define i1 @length32_eq_const(ptr %X) nounwind { ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; X64-SSE41-LABEL: length32_eq_const: ; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE41-NEXT: por %xmm1, %xmm0 -; X64-SSE41-NEXT: ptest %xmm0, %xmm0 +; X64-SSE41-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE41-NEXT: por %xmm0, %xmm1 +; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq ; @@ -1624,8 +1624,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 @@ -1636,8 +1636,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 @@ -1648,8 +1648,8 @@ define i1 @length48_eq_const(ptr %X) nounwind { ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 @@ -1696,11 +1696,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX1-LABEL: length63_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -1708,11 +1708,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX2-LABEL: length63_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -1720,11 +1720,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX512-LABEL: length63_eq: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm1, %ymm1 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX512-NEXT: vpor %ymm1, %ymm0, 
%ymm0 +; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0 +; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al ; X64-AVX512-NEXT: vzeroupper @@ -1791,11 +1791,11 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; ; X64-AVX1-LABEL: length63_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -1803,11 +1803,11 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; ; X64-AVX2-LABEL: length63_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -1815,11 +1815,11 @@ define i1 @length63_eq_const(ptr %X) nounwind { ; ; X64-AVX512-LABEL: length63_eq_const: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al ; X64-AVX512-NEXT: vzeroupper @@ -1864,11 +1864,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX1-LABEL: length64_eq: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -1876,11 +1876,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind { ; ; X64-AVX2-LABEL: length64_eq: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: 
vpxor (%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -1965,11 +1965,11 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; ; X64-AVX1-LABEL: length64_eq_const: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups (%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -1977,11 +1977,11 @@ define i1 @length64_eq_const(ptr %X) nounwind { ; ; X64-AVX2-LABEL: length64_eq_const: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll b/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll index d3b86786a630c..b183bda54aa1e 100644 --- a/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll +++ b/llvm/test/CodeGen/X86/memcpy-scoped-aa.ll @@ -18,10 +18,8 @@ define i32 @test_memcpy(ptr nocapture %p, ptr nocapture readonly %q) { ; MIR-NEXT: {{ $}} ; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi ; MIR-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; MIR-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s128) from %ir.p1, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: MOVUPSmr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into %ir.p0, align 4, !alias.scope !0, !noalias !3) ; MIR-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0) ; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0) ; MIR-NEXT: $eax = COPY [[ADD32rm]] @@ -44,10 +42,8 @@ define i32 @test_memcpy_inline(ptr nocapture %p, ptr nocapture readonly %q) { ; MIR-NEXT: {{ $}} ; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi ; MIR-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; MIR-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias 
!3) - ; MIR-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s128) from %ir.p1, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: MOVUPSmr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into %ir.p0, align 4, !alias.scope !0, !noalias !3) ; MIR-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0) ; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0) ; MIR-NEXT: $eax = COPY [[ADD32rm]] @@ -70,10 +66,8 @@ define i32 @test_memmove(ptr nocapture %p, ptr nocapture readonly %q) { ; MIR-NEXT: {{ $}} ; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi ; MIR-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; MIR-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s128) from %ir.p1, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: MOVUPSmr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into %ir.p0, align 4, !alias.scope !0, !noalias !3) ; MIR-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0) ; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0) ; MIR-NEXT: $eax = COPY [[ADD32rm]] @@ -96,9 +90,8 @@ define i32 @test_memset(ptr nocapture %p, ptr nocapture readonly %q) { ; MIR-NEXT: {{ $}} ; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi ; MIR-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; MIR-NEXT: [[MOV64ri:%[0-9]+]]:gr64 = MOV64ri -6148914691236517206 - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, [[MOV64ri]] :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, [[MOV64ri]] :: (store (s64) into %ir.p0, align 4, !alias.scope !0, !noalias !3) + ; MIR-NEXT: [[MOVAPSrm:%[0-9]+]]:vr128 = MOVAPSrm $rip, 1, $noreg, %const.0, $noreg :: (load (s128) from constant-pool) + ; MIR-NEXT: MOVUPSmr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOVAPSrm]] :: (store (s128) into %ir.p0, align 4, !alias.scope !0, !noalias !3) ; MIR-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0) ; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, 
$noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0) ; MIR-NEXT: $eax = COPY [[ADD32rm]] @@ -119,10 +112,8 @@ define i32 @test_mempcpy(ptr nocapture %p, ptr nocapture readonly %q) { ; MIR-NEXT: {{ $}} ; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi ; MIR-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; MIR-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s64) from %ir.p1, align 1, !alias.scope !0, !noalias !3) - ; MIR-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 24, $noreg :: (load (s64) from %ir.p1 + 8, align 1, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 8, $noreg, killed [[MOV64rm1]] :: (store (s64) into %ir.p0 + 8, align 1, !alias.scope !0, !noalias !3) - ; MIR-NEXT: MOV64mr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOV64rm]] :: (store (s64) into %ir.p0, align 1, !alias.scope !0, !noalias !3) + ; MIR-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm [[COPY1]], 1, $noreg, 16, $noreg :: (load (s128) from %ir.p1, align 1, !alias.scope !0, !noalias !3) + ; MIR-NEXT: MOVUPSmr [[COPY1]], 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into %ir.p0, align 1, !alias.scope !0, !noalias !3) ; MIR-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.q, !alias.scope !3, !noalias !0) ; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 4, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.q1, !alias.scope !3, !noalias !0) ; MIR-NEXT: $eax = COPY [[ADD32rm]] diff --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll index d3999c01a5d71..529d9c17d6797 100644 --- a/llvm/test/CodeGen/X86/memset-inline.ll +++ b/llvm/test/CodeGen/X86/memset-inline.ll @@ -56,11 +56,11 @@ define void @memset_8(ptr %a, i8 %value) nounwind { define void @memset_16(ptr %a, i8 %value) nounwind { ; SSE2-LABEL: memset_16: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl %sil, %eax -; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE2-NEXT: imulq %rax, %rcx -; SSE2-NEXT: movq %rcx, 8(%rdi) -; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: movdqu %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: memset_16: @@ -92,13 +92,12 @@ define void @memset_16(ptr %a, i8 %value) nounwind { define void @memset_32(ptr %a, i8 %value) nounwind { ; SSE2-LABEL: memset_32: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl %sil, %eax -; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE2-NEXT: imulq %rax, %rcx -; SSE2-NEXT: movq %rcx, 24(%rdi) -; SSE2-NEXT: movq %rcx, 16(%rdi) -; SSE2-NEXT: movq %rcx, 8(%rdi) -; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: movdqu %xmm0, 16(%rdi) +; SSE2-NEXT: movdqu %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: memset_32: @@ -133,17 +132,14 @@ define void @memset_32(ptr %a, i8 %value) nounwind { define void @memset_64(ptr %a, i8 %value) nounwind { ; SSE2-LABEL: memset_64: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl %sil, %eax -; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE2-NEXT: imulq %rax, %rcx 
-; SSE2-NEXT: movq %rcx, 56(%rdi) -; SSE2-NEXT: movq %rcx, 48(%rdi) -; SSE2-NEXT: movq %rcx, 40(%rdi) -; SSE2-NEXT: movq %rcx, 32(%rdi) -; SSE2-NEXT: movq %rcx, 24(%rdi) -; SSE2-NEXT: movq %rcx, 16(%rdi) -; SSE2-NEXT: movq %rcx, 8(%rdi) -; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: movdqu %xmm0, 48(%rdi) +; SSE2-NEXT: movdqu %xmm0, 32(%rdi) +; SSE2-NEXT: movdqu %xmm0, 16(%rdi) +; SSE2-NEXT: movdqu %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: memset_64: @@ -346,8 +342,8 @@ define void @bzero_8(ptr %a) nounwind { define void @bzero_16(ptr %a) nounwind { ; SSE2-LABEL: bzero_16: ; SSE2: # %bb.0: -; SSE2-NEXT: movq $0, 8(%rdi) -; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movups %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: bzero_16: @@ -374,10 +370,9 @@ define void @bzero_16(ptr %a) nounwind { define void @bzero_32(ptr %a) nounwind { ; SSE2-LABEL: bzero_32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq $0, 24(%rdi) -; SSE2-NEXT: movq $0, 16(%rdi) -; SSE2-NEXT: movq $0, 8(%rdi) -; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movups %xmm0, 16(%rdi) +; SSE2-NEXT: movups %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: bzero_32: @@ -407,14 +402,11 @@ define void @bzero_32(ptr %a) nounwind { define void @bzero_64(ptr %a) nounwind { ; SSE2-LABEL: bzero_64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq $0, 56(%rdi) -; SSE2-NEXT: movq $0, 48(%rdi) -; SSE2-NEXT: movq $0, 40(%rdi) -; SSE2-NEXT: movq $0, 32(%rdi) -; SSE2-NEXT: movq $0, 24(%rdi) -; SSE2-NEXT: movq $0, 16(%rdi) -; SSE2-NEXT: movq $0, 8(%rdi) -; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movups %xmm0, 48(%rdi) +; SSE2-NEXT: movups %xmm0, 32(%rdi) +; SSE2-NEXT: movups %xmm0, 16(%rdi) +; SSE2-NEXT: movups %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE4-LABEL: bzero_64: diff --git a/llvm/test/CodeGen/X86/memset-minsize.ll b/llvm/test/CodeGen/X86/memset-minsize.ll index d66500ea31a0d..9461e249d542e 100644 --- a/llvm/test/CodeGen/X86/memset-minsize.ll +++ b/llvm/test/CodeGen/X86/memset-minsize.ll @@ -14,10 +14,15 @@ entry: define void @small_memset_to_rep_stos(ptr %ptr) minsize nounwind { ; CHECK-LABEL: small_memset_to_rep_stos: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq $32 -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: rep;stosl %eax, %es:(%rdi) +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, 112(%rdi) +; CHECK-NEXT: movups %xmm0, 96(%rdi) +; CHECK-NEXT: movups %xmm0, 80(%rdi) +; CHECK-NEXT: movups %xmm0, 64(%rdi) +; CHECK-NEXT: movups %xmm0, 48(%rdi) +; CHECK-NEXT: movups %xmm0, 32(%rdi) +; CHECK-NEXT: movups %xmm0, 16(%rdi) +; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: retq entry: call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 128, i1 false) @@ -127,10 +132,15 @@ entry: define void @small_memset_to_rep_stos_64(ptr %ptr) minsize nounwind { ; CHECK-LABEL: small_memset_to_rep_stos_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq $32 -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: rep;stosl %eax, %es:(%rdi) +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movups %xmm0, 112(%rdi) +; CHECK-NEXT: movups %xmm0, 96(%rdi) +; CHECK-NEXT: movups %xmm0, 80(%rdi) +; CHECK-NEXT: movups %xmm0, 64(%rdi) +; CHECK-NEXT: movups %xmm0, 48(%rdi) +; CHECK-NEXT: movups %xmm0, 32(%rdi) +; CHECK-NEXT: 
movups %xmm0, 16(%rdi) +; CHECK-NEXT: movups %xmm0, (%rdi) ; CHECK-NEXT: retq entry: call void @llvm.memset.p0.i64(ptr align 8 %ptr, i8 0, i64 128, i1 false) diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll index d07b0f64d68c1..42e8a3605ec8e 100644 --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -15,9 +15,8 @@ define void @memset_16_nonzero_bytes(ptr %x) { ; SSE-LABEL: memset_16_nonzero_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; SSE-NEXT: movq %rax, 8(%rdi) -; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; SSE-NEXT: movups %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_16_nonzero_bytes: @@ -38,11 +37,9 @@ define void @memset_16_nonzero_bytes(ptr %x) { define void @memset_32_nonzero_bytes(ptr %x) { ; SSE-LABEL: memset_32_nonzero_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %rax, 16(%rdi) -; SSE-NEXT: movq %rax, 8(%rdi) -; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; SSE-NEXT: movups %xmm0, 16(%rdi) +; SSE-NEXT: movups %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_32_nonzero_bytes: @@ -54,7 +51,7 @@ define void @memset_32_nonzero_bytes(ptr %x) { ; ; AVX-LABEL: memset_32_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -65,15 +62,11 @@ define void @memset_32_nonzero_bytes(ptr %x) { define void @memset_64_nonzero_bytes(ptr %x) { ; SSE-LABEL: memset_64_nonzero_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rax, 40(%rdi) -; SSE-NEXT: movq %rax, 32(%rdi) -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %rax, 16(%rdi) -; SSE-NEXT: movq %rax, 8(%rdi) -; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; SSE-NEXT: movups %xmm0, 48(%rdi) +; SSE-NEXT: movups %xmm0, 32(%rdi) +; SSE-NEXT: movups %xmm0, 16(%rdi) +; SSE-NEXT: movups %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_64_nonzero_bytes: @@ -87,7 +80,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_64_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -95,7 +88,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = 
[42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) ; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper @@ -110,7 +103,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; AVX512BW-LABEL: memset_64_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -122,23 +115,15 @@ define void @memset_64_nonzero_bytes(ptr %x) { define void @memset_128_nonzero_bytes(ptr %x) { ; SSE-LABEL: memset_128_nonzero_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A -; SSE-NEXT: movq %rax, 120(%rdi) -; SSE-NEXT: movq %rax, 112(%rdi) -; SSE-NEXT: movq %rax, 104(%rdi) -; SSE-NEXT: movq %rax, 96(%rdi) -; SSE-NEXT: movq %rax, 88(%rdi) -; SSE-NEXT: movq %rax, 80(%rdi) -; SSE-NEXT: movq %rax, 72(%rdi) -; SSE-NEXT: movq %rax, 64(%rdi) -; SSE-NEXT: movq %rax, 56(%rdi) -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rax, 40(%rdi) -; SSE-NEXT: movq %rax, 32(%rdi) -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %rax, 16(%rdi) -; SSE-NEXT: movq %rax, 8(%rdi) -; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; SSE-NEXT: movups %xmm0, 112(%rdi) +; SSE-NEXT: movups %xmm0, 96(%rdi) +; SSE-NEXT: movups %xmm0, 80(%rdi) +; SSE-NEXT: movups %xmm0, 64(%rdi) +; SSE-NEXT: movups %xmm0, 48(%rdi) +; SSE-NEXT: movups %xmm0, 32(%rdi) +; SSE-NEXT: movups %xmm0, 16(%rdi) +; SSE-NEXT: movups %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_128_nonzero_bytes: @@ -156,7 +141,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_128_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) ; AVX1-NEXT: vmovups %ymm0, 64(%rdi) ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) @@ -166,7 +151,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) ; AVX2-NEXT: vmovups %ymm0, 64(%rdi) ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) @@ -184,7 +169,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX512BW-LABEL: memset_128_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = 
[42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -196,9 +181,24 @@ define void @memset_128_nonzero_bytes(ptr %x) { define void @memset_256_nonzero_bytes(ptr %x) { ; SSE-LABEL: memset_256_nonzero_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movl $256, %edx # imm = 0x100 -; SSE-NEXT: movl $42, %esi -; SSE-NEXT: jmp memset@PLT # TAILCALL +; SSE-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; SSE-NEXT: movups %xmm0, 240(%rdi) +; SSE-NEXT: movups %xmm0, 224(%rdi) +; SSE-NEXT: movups %xmm0, 208(%rdi) +; SSE-NEXT: movups %xmm0, 192(%rdi) +; SSE-NEXT: movups %xmm0, 176(%rdi) +; SSE-NEXT: movups %xmm0, 160(%rdi) +; SSE-NEXT: movups %xmm0, 144(%rdi) +; SSE-NEXT: movups %xmm0, 128(%rdi) +; SSE-NEXT: movups %xmm0, 112(%rdi) +; SSE-NEXT: movups %xmm0, 96(%rdi) +; SSE-NEXT: movups %xmm0, 80(%rdi) +; SSE-NEXT: movups %xmm0, 64(%rdi) +; SSE-NEXT: movups %xmm0, 48(%rdi) +; SSE-NEXT: movups %xmm0, 32(%rdi) +; SSE-NEXT: movups %xmm0, 16(%rdi) +; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_256_nonzero_bytes: ; SSE2FAST: # %bb.0: @@ -223,7 +223,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_256_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 224(%rdi) ; AVX1-NEXT: vmovups %ymm0, 192(%rdi) ; AVX1-NEXT: vmovups %ymm0, 160(%rdi) @@ -237,7 +237,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 224(%rdi) ; AVX2-NEXT: vmovups %ymm0, 192(%rdi) ; AVX2-NEXT: vmovups %ymm0, 160(%rdi) @@ -261,7 +261,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX512BW-LABEL: memset_256_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) @@ -279,11 +279,11 @@ declare ptr @__memset_chk(ptr, i32, i64, i64) define void @memset_16_nonconst_bytes(ptr %x, i8 %c) { ; SSE-LABEL: memset_16_nonconst_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movzbl %sil, %eax -; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE-NEXT: imulq %rax, %rcx -; SSE-NEXT: movq %rcx, 8(%rdi) -; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_16_nonconst_bytes: @@ -323,13 +323,12 @@ define void @memset_16_nonconst_bytes(ptr %x, i8 %c) { define void @memset_32_nonconst_bytes(ptr %x, i8 %c) { ; SSE-LABEL: memset_32_nonconst_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movzbl %sil, %eax -; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE-NEXT: imulq %rax, %rcx -; SSE-NEXT: movq %rcx, 24(%rdi) -; SSE-NEXT: movq %rcx, 16(%rdi) -; SSE-NEXT: movq %rcx, 8(%rdi) -; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqu %xmm0, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_32_nonconst_bytes: @@ -373,17 +372,14 @@ define void @memset_32_nonconst_bytes(ptr %x, i8 %c) { define void @memset_64_nonconst_bytes(ptr %x, i8 %c) { ; SSE-LABEL: memset_64_nonconst_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movzbl %sil, %eax -; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE-NEXT: imulq %rax, %rcx -; SSE-NEXT: movq %rcx, 56(%rdi) -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rcx, 40(%rdi) -; SSE-NEXT: movq %rcx, 32(%rdi) -; SSE-NEXT: movq %rcx, 24(%rdi) -; SSE-NEXT: movq %rcx, 16(%rdi) -; SSE-NEXT: movq %rcx, 8(%rdi) -; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqu %xmm0, 48(%rdi) +; SSE-NEXT: movdqu %xmm0, 32(%rdi) +; SSE-NEXT: movdqu %xmm0, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_64_nonconst_bytes: @@ -440,25 +436,18 @@ define void @memset_64_nonconst_bytes(ptr %x, i8 %c) { define void @memset_128_nonconst_bytes(ptr %x, i8 %c) { ; SSE-LABEL: memset_128_nonconst_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movzbl %sil, %eax -; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 -; SSE-NEXT: imulq %rax, %rcx -; SSE-NEXT: movq %rcx, 120(%rdi) -; SSE-NEXT: movq %rcx, 112(%rdi) -; SSE-NEXT: movq %rcx, 104(%rdi) -; SSE-NEXT: movq %rcx, 96(%rdi) -; SSE-NEXT: movq %rcx, 88(%rdi) -; SSE-NEXT: movq %rcx, 80(%rdi) -; SSE-NEXT: movq %rcx, 72(%rdi) -; SSE-NEXT: movq %rcx, 64(%rdi) -; SSE-NEXT: movq %rcx, 56(%rdi) -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rcx, 40(%rdi) -; SSE-NEXT: movq %rcx, 32(%rdi) -; SSE-NEXT: movq %rcx, 24(%rdi) -; SSE-NEXT: movq %rcx, 16(%rdi) -; SSE-NEXT: movq %rcx, 8(%rdi) -; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqu %xmm0, 112(%rdi) +; SSE-NEXT: movdqu %xmm0, 96(%rdi) +; SSE-NEXT: movdqu %xmm0, 80(%rdi) +; SSE-NEXT: movdqu %xmm0, 64(%rdi) +; SSE-NEXT: movdqu %xmm0, 48(%rdi) +; SSE-NEXT: movdqu %xmm0, 32(%rdi) +; SSE-NEXT: movdqu %xmm0, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_128_nonconst_bytes: @@ -525,8 +514,27 @@ define void @memset_128_nonconst_bytes(ptr %x, i8 %c) { define void 
@memset_256_nonconst_bytes(ptr %x, i8 %c) { ; SSE-LABEL: memset_256_nonconst_bytes: ; SSE: # %bb.0: -; SSE-NEXT: movl $256, %edx # imm = 0x100 -; SSE-NEXT: jmp memset@PLT # TAILCALL +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqu %xmm0, 240(%rdi) +; SSE-NEXT: movdqu %xmm0, 224(%rdi) +; SSE-NEXT: movdqu %xmm0, 208(%rdi) +; SSE-NEXT: movdqu %xmm0, 192(%rdi) +; SSE-NEXT: movdqu %xmm0, 176(%rdi) +; SSE-NEXT: movdqu %xmm0, 160(%rdi) +; SSE-NEXT: movdqu %xmm0, 144(%rdi) +; SSE-NEXT: movdqu %xmm0, 128(%rdi) +; SSE-NEXT: movdqu %xmm0, 112(%rdi) +; SSE-NEXT: movdqu %xmm0, 96(%rdi) +; SSE-NEXT: movdqu %xmm0, 80(%rdi) +; SSE-NEXT: movdqu %xmm0, 64(%rdi) +; SSE-NEXT: movdqu %xmm0, 48(%rdi) +; SSE-NEXT: movdqu %xmm0, 32(%rdi) +; SSE-NEXT: movdqu %xmm0, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_256_nonconst_bytes: ; SSE2FAST: # %bb.0: diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll index 480a0970bd39d..df87f1b2b2e0d 100644 --- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll @@ -19,30 +19,21 @@ define void @bork(ptr nocapture align 4 %dst) nounwind { ; SLOW_32: # %bb.0: ; SLOW_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SLOW_32-NEXT: xorps %xmm0, %xmm0 -; SLOW_32-NEXT: movsd %xmm0, 72(%eax) -; SLOW_32-NEXT: movsd %xmm0, 64(%eax) -; SLOW_32-NEXT: movsd %xmm0, 56(%eax) -; SLOW_32-NEXT: movsd %xmm0, 48(%eax) -; SLOW_32-NEXT: movsd %xmm0, 40(%eax) -; SLOW_32-NEXT: movsd %xmm0, 32(%eax) -; SLOW_32-NEXT: movsd %xmm0, 24(%eax) -; SLOW_32-NEXT: movsd %xmm0, 16(%eax) -; SLOW_32-NEXT: movsd %xmm0, 8(%eax) -; SLOW_32-NEXT: movsd %xmm0, (%eax) +; SLOW_32-NEXT: movups %xmm0, 64(%eax) +; SLOW_32-NEXT: movups %xmm0, 48(%eax) +; SLOW_32-NEXT: movups %xmm0, 32(%eax) +; SLOW_32-NEXT: movups %xmm0, 16(%eax) +; SLOW_32-NEXT: movups %xmm0, (%eax) ; SLOW_32-NEXT: retl ; ; SLOW_64-LABEL: bork: ; SLOW_64: # %bb.0: -; SLOW_64-NEXT: movq $0, 72(%rdi) -; SLOW_64-NEXT: movq $0, 64(%rdi) -; SLOW_64-NEXT: movq $0, 56(%rdi) -; SLOW_64-NEXT: movq $0, 48(%rdi) -; SLOW_64-NEXT: movq $0, 40(%rdi) -; SLOW_64-NEXT: movq $0, 32(%rdi) -; SLOW_64-NEXT: movq $0, 24(%rdi) -; SLOW_64-NEXT: movq $0, 16(%rdi) -; SLOW_64-NEXT: movq $0, 8(%rdi) -; SLOW_64-NEXT: movq $0, (%rdi) +; SLOW_64-NEXT: xorps %xmm0, %xmm0 +; SLOW_64-NEXT: movups %xmm0, 64(%rdi) +; SLOW_64-NEXT: movups %xmm0, 48(%rdi) +; SLOW_64-NEXT: movups %xmm0, 32(%rdi) +; SLOW_64-NEXT: movups %xmm0, 16(%rdi) +; SLOW_64-NEXT: movups %xmm0, (%rdi) ; SLOW_64-NEXT: retq call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false) ret void diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll index 9c3057e4e42a4..1d28c044ab262 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -365,9 +365,9 @@ define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind { ; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax ; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) ; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rcx ; X64-SSE41-NEXT: movntiq %rax, (%rsi) -; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax -; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movntiq %rcx, 24(%rsi) ; X64-SSE41-NEXT: movq %xmm1, %rax ; 
X64-SSE41-NEXT: movntiq %rax, 16(%rsi) ; X64-SSE41-NEXT: retq @@ -379,9 +379,9 @@ define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind { ; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, 8(%rsi) ; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rcx ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax -; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) ; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq @@ -476,9 +476,9 @@ define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind { ; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax ; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) ; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rcx ; X64-SSE41-NEXT: movntiq %rax, (%rsi) -; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax -; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movntiq %rcx, 24(%rsi) ; X64-SSE41-NEXT: movq %xmm1, %rax ; X64-SSE41-NEXT: movntiq %rax, 16(%rsi) ; X64-SSE41-NEXT: retq @@ -490,9 +490,9 @@ define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind { ; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, 8(%rsi) ; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rcx ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax -; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) ; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll index 8030d5f08fa57..e7778c0aaf322 100644 --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -58,14 +58,14 @@ define void @big_nonzero_32_bytes_splat(ptr nocapture %a) { ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge_store.ll b/llvm/test/CodeGen/X86/merge_store.ll index afe0ef969a40e..2ce92eed7bf2f 100644 --- a/llvm/test/CodeGen/X86/merge_store.ll +++ b/llvm/test/CodeGen/X86/merge_store.ll @@ -5,12 +5,11 @@ define void @merge_store(ptr nocapture %a) { ; CHECK-LABEL: merge_store: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001 +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq %rcx, (%rdi,%rax,4) -; CHECK-NEXT: movq %rcx, 8(%rdi,%rax,4) +; CHECK-NEXT: movups %xmm0, (%rdi,%rax,4) ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpl $1000, %eax # imm = 0x3E8 ; CHECK-NEXT: jl .LBB0_1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index ac6b7e54ca5b5..8eed5143ac403 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -107,9 +107,9 @@ define <4 x i32> 
@vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwi ; ; XOPAVX2-LABEL: vec128_i32_signed_reg_reg: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 @@ -232,9 +232,9 @@ define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) noun ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,1,1,1] ; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 @@ -266,9 +266,9 @@ define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) noun ; ; XOPAVX2-LABEL: vec128_i32_unsigned_reg_reg: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; XOPAVX2-NEXT: vpcomgtud %xmm1, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX2-NEXT: vpminud %xmm1, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 @@ -426,9 +426,9 @@ define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwin ; XOPAVX2-LABEL: vec128_i32_signed_mem_reg: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; XOPAVX2-NEXT: vpcomgtd %xmm0, %xmm1, %xmm3 +; XOPAVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3 ; XOPAVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 @@ -586,9 +586,9 @@ define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwin ; XOPAVX2-LABEL: vec128_i32_signed_reg_mem: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 @@ -753,9 +753,9 @@ define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0 ; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] +; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm3 +; XOPAVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 ; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 ; XOPAVX2-NEXT: vpsubd %xmm3, 
%xmm1, %xmm1 @@ -830,71 +830,38 @@ define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; Values come from regs define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind { -; SSE2-LABEL: vec128_i64_signed_reg_reg: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psubq %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: psrlq $1, %xmm3 -; SSE2-NEXT: psrlq $33, %xmm4 -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: vec128_i64_signed_reg_reg: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psubq %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 -; SSE41-NEXT: psrlq $33, %xmm4 -; SSE41-NEXT: pmuludq %xmm1, %xmm4 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm3, %xmm2 -; SSE41-NEXT: paddq %xmm4, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm1, %xmm3 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: vec128_i64_signed_reg_reg: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psubq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrlq $1, %xmm3 +; SSE-NEXT: psrlq $33, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: paddq %xmm3, 
%xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: vec128_i64_signed_reg_reg: ; AVX: # %bb.0: @@ -940,7 +907,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -976,7 +943,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1005,71 +972,38 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi } define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwind { -; SSE2-LABEL: vec128_i64_unsigned_reg_reg: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1] -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psubq %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: psrlq $1, %xmm3 -; SSE2-NEXT: psrlq $33, %xmm4 -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: vec128_i64_unsigned_reg_reg: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psubq %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 -; SSE41-NEXT: psrlq $33, %xmm4 -; SSE41-NEXT: pmuludq %xmm1, %xmm4 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm3, %xmm2 -; SSE41-NEXT: paddq %xmm4, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: 
pmuludq %xmm1, %xmm3 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: vec128_i64_unsigned_reg_reg: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,1] +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psubq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrlq $1, %xmm3 +; SSE-NEXT: psrlq $33, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: vec128_i64_unsigned_reg_reg: ; AVX1: # %bb.0: @@ -1096,7 +1030,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; ; AVX2-LABEL: vec128_i64_unsigned_reg_reg: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1141,7 +1075,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1177,7 +1111,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1208,73 +1142,39 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; Values are loaded. Only check signed case. 
define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwind { -; SSE2-LABEL: vec128_i64_signed_mem_reg: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: psubq %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $33, %xmm5 -; SSE2-NEXT: pmuludq %xmm4, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: vec128_i64_signed_mem_reg: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psubq %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] -; SSE41-NEXT: por %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psubq %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm5 -; SSE41-NEXT: pmuludq %xmm4, %xmm5 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: paddq %xmm5, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm4, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: vec128_i64_signed_mem_reg: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psubq %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psubq %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $33, %xmm5 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm5, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 
+; SSE-NEXT: retq ; ; AVX-LABEL: vec128_i64_signed_mem_reg: ; AVX: # %bb.0: @@ -1322,7 +1222,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1359,7 +1259,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1389,75 +1289,40 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin } define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwind { -; SSE2-LABEL: vec128_i64_signed_reg_mem: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubq %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 -; SSE2-NEXT: psrlq $33, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm3 -; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm3 -; SSE2-NEXT: psllq $32, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: paddq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: vec128_i64_signed_reg_mem: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [1,1] -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psubq %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psubq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlq $1, %xmm4 -; SSE41-NEXT: psrlq $33, %xmm1 -; SSE41-NEXT: pmuludq %xmm2, %xmm1 -; SSE41-NEXT: psrlq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm4, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm3 -; SSE41-NEXT: 
psllq $32, %xmm3 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 -; SSE41-NEXT: paddq %xmm4, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: vec128_i64_signed_reg_mem: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psubq %xmm1, %xmm4 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psubq %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlq $1, %xmm4 +; SSE-NEXT: psrlq $33, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: paddq %xmm1, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: vec128_i64_signed_reg_mem: ; AVX: # %bb.0: @@ -1505,7 +1370,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1542,7 +1407,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1572,75 +1437,40 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin } define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { -; SSE2-LABEL: vec128_i64_signed_mem_mem: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: psubq %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $33, %xmm5 -; SSE2-NEXT: pmuludq %xmm4, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm0, 
%xmm2 -; SSE2-NEXT: paddq %xmm5, %xmm2 -; SSE2-NEXT: psllq $32, %xmm2 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: vec128_i64_signed_mem_mem: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa (%rsi), %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psubq %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] -; SSE41-NEXT: por %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psubq %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $33, %xmm5 -; SSE41-NEXT: pmuludq %xmm4, %xmm5 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: paddq %xmm5, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm4, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: vec128_i64_signed_mem_mem: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psubq %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psubq %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $33, %xmm5 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm5, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: vec128_i64_signed_mem_mem: ; AVX: # %bb.0: @@ -1690,7 +1520,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1728,7 +1558,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; 
AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1832,7 +1662,7 @@ define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -1957,7 +1787,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 @@ -2065,7 +1895,7 @@ define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 @@ -2173,7 +2003,7 @@ define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2286,7 +2116,7 @@ define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2368,7 +2198,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 @@ -2511,7 +2341,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; 
AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -2591,7 +2421,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psubb %xmm2, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm5 @@ -2740,7 +2570,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 @@ -2822,7 +2652,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 @@ -2972,7 +2802,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 @@ -3053,7 +2883,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 @@ -3203,7 +3033,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -3286,7 +3116,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 @@ -3443,7 +3273,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index a8021e3164f34..da08dee43fdec 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -399,7 +399,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -424,20 +425,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; ; AVX2-LABEL: vec256_i64_signed_reg_reg: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -457,7 +458,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = 
mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -485,20 +487,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -520,20 +522,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq @@ -570,7 +572,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: 
vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm4 @@ -598,9 +601,9 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 @@ -631,7 +634,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -659,20 +663,20 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -694,20 +698,20 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 -; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ; 
AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 -; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq @@ -741,7 +745,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -767,20 +772,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX2-LABEL: vec256_i64_signed_mem_reg: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -801,7 +806,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -829,20 +835,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512F-NEXT: 
vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm3 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlq $1, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $1, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlq $33, %ymm0, %ymm0 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -865,20 +871,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2 +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm3 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 -; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm0, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm0, %ymm3 ; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq @@ -911,7 +917,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -937,20 +944,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX2-LABEL: vec256_i64_signed_reg_mem: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; 
AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -971,7 +978,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm1, %xmm9, %xmm1 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -999,20 +1007,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1035,20 +1043,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, 
%zmm2 {%k1} +; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq @@ -1082,7 +1090,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7 ; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; AVX1-NEXT: # xmm8 = mem[0,0] ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm9 ; AVX1-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -1109,20 +1118,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 ; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1144,7 +1153,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6 ; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7 ; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: vmovddup {{.*#+}} xmm8 = [1,1] +; XOP-NEXT: # xmm8 = mem[0,0] ; XOP-NEXT: vpor %xmm5, %xmm8, %xmm9 ; XOP-NEXT: vpmuludq %xmm0, %xmm9, %xmm0 ; XOP-NEXT: vpsrlq $32, %xmm5, %xmm5 @@ -1172,20 +1182,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: 
vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1209,20 +1219,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 -; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 +; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 5f6337e29d685..61c2343622c66 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -721,7 +721,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; 
AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 @@ -805,7 +805,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 @@ -864,7 +864,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 @@ -896,7 +896,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vmovdqa64 {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 @@ -954,7 +954,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 @@ -986,7 +986,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vmovdqa64 {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 @@ -1045,7 +1045,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 @@ -1078,7 +1078,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index 1921cf383b2f2..cec3edf7a5104 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -18,7 +18,7 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: setle %cl -; X64-NEXT: leal -1(%rcx,%rcx), %ecx +; X64-NEXT: leal -1(,%rcx,2), %ecx ; X64-NEXT: subl %edi, %esi ; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: shrl %eax @@ -34,7 +34,7 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: leal -1(,%eax,2), %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax ; X86-NEXT: jg .LBB0_2 @@ -84,7 +84,7 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind { ; X86-NEXT: movl %edi, %esi ; X86-NEXT: subl %ecx, %esi ; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: leal -1(,%eax,2), %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %edi, %eax ; X86-NEXT: ja .LBB1_2 @@ -118,7 +118,7 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind { ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: setle %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: subl %ecx, %esi ; X64-NEXT: cmovgel %esi, %eax ; X64-NEXT: shrl %eax @@ -135,7 +135,7 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %esi +; X86-NEXT: leal -1(,%eax,2), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %edx, %eax ; X86-NEXT: jg .LBB2_2 @@ -168,7 +168,7 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind { ; X64-NEXT: movl %edi, %edx ; X64-NEXT: subl %eax, %edx ; X64-NEXT: setle %cl -; X64-NEXT: leal -1(%rcx,%rcx), %ecx +; X64-NEXT: leal -1(,%rcx,2), %ecx ; X64-NEXT: subl %edi, %eax ; X64-NEXT: cmovll %edx, %eax ; X64-NEXT: shrl %eax @@ -185,7 +185,7 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: leal -1(,%eax,2), %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax ; X86-NEXT: jg .LBB3_2 @@ -219,7 +219,7 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X64-NEXT: movl %ecx, %esi ; X64-NEXT: subl %eax, %esi ; X64-NEXT: setle %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: subl %ecx, %eax ; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: shrl %eax @@ -237,7 +237,7 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: setle %al -; X86-NEXT: leal -1(%eax,%eax), %edx +; X86-NEXT: leal -1(,%eax,2), %edx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: subl %esi, %eax ; X86-NEXT: jg .LBB4_2 @@ -276,7 
+276,7 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: setle %cl -; X64-NEXT: leaq -1(%rcx,%rcx), %rcx +; X64-NEXT: leaq -1(,%rcx,2), %rcx ; X64-NEXT: subq %rdi, %rsi ; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: shrq %rax @@ -290,35 +290,36 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: subl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %cl +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: orl $1, %ebp +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: subl %edi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: jl .LBB5_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB5_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ebx, %ebp -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: shrl %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi @@ -359,34 +360,33 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %eax, %ebp -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: orl $1, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: subl %ebp, %esi +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl %ebp, %esi +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: orl $1, %edx +; X86-NEXT: subl %esi, %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: shldl $31, %esi, %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %edi +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %ebp +; X86-NEXT: subl %ebx, %ebp +; X86-NEXT: sbbl %ebx, %eax 
+; X86-NEXT: movl %eax, %ebx +; X86-NEXT: shldl $31, %ebp, %eax +; X86-NEXT: shrl %ebx +; X86-NEXT: imull %eax, %edi +; X86-NEXT: imull %edx, %ebx +; X86-NEXT: mull %edx ; X86-NEXT: addl %ebx, %edx -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi @@ -415,7 +415,7 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: setle %dl -; X64-NEXT: leaq -1(%rdx,%rdx), %rdx +; X64-NEXT: leaq -1(,%rdx,2), %rdx ; X64-NEXT: subq %rcx, %rsi ; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: shrq %rax @@ -430,41 +430,42 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %ebx -; X86-NEXT: movl 4(%eax), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl (%ecx), %ebp +; X86-NEXT: movl 4(%ecx), %edi +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: setl %cl +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: orl $1, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ebp, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: jl .LBB7_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB7_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: shrl %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %edi, %esi +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: addl $12, %esp @@ -493,7 +494,7 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind { ; X64-NEXT: movq %rdi, %rdx ; X64-NEXT: subq %rax, %rdx ; X64-NEXT: setle %cl -; X64-NEXT: leaq -1(%rcx,%rcx), %rcx +; X64-NEXT: leaq -1(,%rcx,2), %rcx ; X64-NEXT: subq %rdi, %rax ; X64-NEXT: cmovlq %rdx, %rax ; X64-NEXT: shrq %rax @@ -507,36 +508,37 @@ define i64 
@scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: orl $1, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: subl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setl %cl +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: orl $1, %ebp +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: subl %edi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: jl .LBB8_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB8_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ebx, %ebp -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: shrl %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi @@ -565,7 +567,7 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X64-NEXT: movq %rcx, %rsi ; X64-NEXT: subq %rax, %rsi ; X64-NEXT: setle %dl -; X64-NEXT: leaq -1(%rdx,%rdx), %rdx +; X64-NEXT: leaq -1(,%rdx,2), %rdx ; X64-NEXT: subq %rcx, %rax ; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: shrq %rax @@ -580,42 +582,43 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ebx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 4(%eax), %ecx -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: orl $1, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ebx, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %ebp +; X86-NEXT: movl 4(%eax), %edi +; X86-NEXT: movl (%ecx), %eax +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: setl %cl +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: 
movl %ebx, %ecx +; X86-NEXT: orl $1, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ebp, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: jl .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB9_2: -; X86-NEXT: shrdl $1, %ebp, %eax -; X86-NEXT: shrl %ebp -; X86-NEXT: imull %eax, %edi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: shrl %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %edi, %esi +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: addl $12, %esp @@ -649,7 +652,7 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpw %si, %di ; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %ecx +; X64-NEXT: leal -1(,%rax,2), %ecx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: movswl %di, %edx @@ -677,7 +680,7 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx +; X86-NEXT: leal -1(,%ebx,2), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -702,7 +705,7 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpw %di, %si ; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %ecx +; X64-NEXT: leal -1(,%rax,2), %ecx ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: movzwl %di, %edx @@ -730,7 +733,7 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: setae %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx +; X86-NEXT: leal -1(,%ebx,2), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -758,7 +761,7 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpw %si, %cx ; X64-NEXT: setle %al -; X64-NEXT: leal -1(%rax,%rax), %edx +; X64-NEXT: leal -1(,%rax,2), %edx ; X64-NEXT: movl %ecx, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: movswl %si, %esi @@ -786,7 +789,7 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx +; X86-NEXT: leal -1(,%ebx,2), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -813,7 +816,7 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpw %ax, %di ; X64-NEXT: setle %cl -; X64-NEXT: leal -1(%rcx,%rcx), %ecx +; X64-NEXT: leal -1(,%rcx,2), %ecx ; X64-NEXT: movl %edi, %edx ; X64-NEXT: subl %eax, %edx ; X64-NEXT: movswl %di, %esi @@ -841,7 +844,7 @@ define i16 
@scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx +; X86-NEXT: leal -1(,%ebx,2), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax @@ -869,7 +872,7 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpw %ax, %cx ; X64-NEXT: setle %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(,%rdx,2), %edx ; X64-NEXT: movl %ecx, %esi ; X64-NEXT: subl %eax, %esi ; X64-NEXT: subl %ecx, %eax @@ -897,7 +900,7 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: setle %bl -; X86-NEXT: leal -1(%ebx,%ebx), %edx +; X86-NEXT: leal -1(,%ebx,2), %edx ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: imull %edx, %eax diff --git a/llvm/test/CodeGen/X86/misched-critical-path.ll b/llvm/test/CodeGen/X86/misched-critical-path.ll index 2a95aaa46d4a4..946fcfe13bd3c 100644 --- a/llvm/test/CodeGen/X86/misched-critical-path.ll +++ b/llvm/test/CodeGen/X86/misched-critical-path.ll @@ -9,10 +9,10 @@ ; ; CHECK: SU(8): CMP8rr %4:gr8, %3:gr8, implicit-def $eflags ; CHECK: Predecessors: -; CHECK-NEXT: SU(6): Data Latency=0 Reg=%4 +; CHECK-NEXT: SU(6): Data Latency=1 Reg=%4 ; CHECK-NEXT: SU(7): Out Latency=0 ; CHECK-NEXT: SU(5): Out Latency=0 -; CHECK-NEXT: SU(3): Data Latency=4 Reg=%3 +; CHECK-NEXT: SU(3): Data Latency=5 Reg=%3 define void @misched_bug() nounwind { entry: %v0 = load i8, ptr @sc, align 1 diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..45c7ced973d95 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -547,8 +547,8 @@ define void @ti64(double %a, double %b) nounwind { ; X86-LABEL: ti64: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 0 ; X86-NEXT: movl %ecx, 4 diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll index d8a010bacc683..f1f884a2faec6 100644 --- a/llvm/test/CodeGen/X86/mmx-build-vector.ll +++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll @@ -649,11 +649,11 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind { ; ; X86-SSE-LABEL: build_v2f32_01: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movdq2q %xmm0, %mm0 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movdq2q %xmm0, %mm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] ; X86-SSE-NEXT: paddd %mm1, %mm1 ; X86-SSE-NEXT: movq %mm1, (%eax) @@ -725,9 +725,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind { ; ; X86-SSE-LABEL: build_v2f32_u1: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) @@ -798,9 +798,9 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind { ; ; 
X86-SSE-LABEL: build_v2f32_00: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X86-SSE-NEXT: paddd %mm0, %mm0 ; X86-SSE-NEXT: movq %mm0, (%eax) diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll index 6fe3bc4973185..9957e5072fb7c 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-load.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll @@ -308,8 +308,10 @@ define i64 @tt0(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddb (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -347,8 +349,10 @@ define i64 @tt1(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddw (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -385,8 +389,10 @@ define i64 @tt2(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddd (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -423,8 +429,10 @@ define i64 @tt3(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddq (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -461,8 +469,10 @@ define i64 @tt4(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -499,8 +509,10 @@ define i64 @tt5(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: paddusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -537,8 +549,10 @@ define i64 @tt6(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: psrlw (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -575,8 +589,10 @@ define i64 @tt7(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # 
%entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: psrld (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q @@ -613,8 +629,10 @@ define i64 @tt8(<1 x i64> %t, ptr %q) nounwind { ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: psrlq (%rsi), %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: %v = load <1 x i64>, ptr %q diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 9b624a935bada..2abfa758fbb20 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1984,7 +1984,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -3185,7 +3185,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -4098,7 +4098,7 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) { ; KNL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $7, %edi @@ -4143,7 +4143,7 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax @@ -4200,7 +4200,7 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax @@ -4247,7 +4247,7 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) { ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpeq_uqps %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax @@ -4291,7 +4291,7 @@ define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) { ; 
KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmplepd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll index 22929fa4b8a17..b9149e14a34e0 100644 --- a/llvm/test/CodeGen/X86/movtopush.ll +++ b/llvm/test/CodeGen/X86/movtopush.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL ; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 @@ -22,43 +23,150 @@ declare void @llvm.stackrestore(ptr) ; We should get pushes for x86, even though there is a reserved call frame. ; Make sure we don't touch x86-64, and that turning it off works. +define void @test1() { ; NORMAL-LABEL: test1: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -; X64-LABEL: test1: -; X64: movl $1, %ecx -; X64-NEXT: movl $2, %edx -; X64-NEXT: movl $3, %r8d -; X64-NEXT: movl $4, %r9d -; X64-NEXT: callq good +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; ; NOPUSH-LABEL: test1: -; NOPUSH: subl $16, %esp -; NOPUSH-NEXT: movl $4, 12(%esp) -; NOPUSH-NEXT: movl $3, 8(%esp) -; NOPUSH-NEXT: movl $2, 4(%esp) -; NOPUSH-NEXT: movl $1, (%esp) -; NOPUSH-NEXT: call -; NOPUSH-NEXT: addl $16, %esp -define void @test1() { +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test1: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test1: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: call void @good(i32 1, i32 2, i32 3, i32 4) ret void } ; If we have a reserved frame, we should have pushes -; NORMAL-LABEL: test2: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call define void @test2(i32 %k) { +; NORMAL-LABEL: test2: +; NORMAL: 
# %bb.0: # %entry +; NORMAL-NEXT: pushl %ebp +; NORMAL-NEXT: movl %esp, %ebp +; NORMAL-NEXT: movl 8(%ebp), %eax +; NORMAL-NEXT: shll $2, %eax +; NORMAL-NEXT: calll __chkstk +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: movl %ebp, %esp +; NORMAL-NEXT: popl %ebp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test2: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: pushl %ebp +; NOPUSH-NEXT: movl %esp, %ebp +; NOPUSH-NEXT: movl 8(%ebp), %eax +; NOPUSH-NEXT: shll $2, %eax +; NOPUSH-NEXT: calll __chkstk +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: movl %ebp, %esp +; NOPUSH-NEXT: popl %ebp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test2: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: .seh_pushreg %rbp +; X64-NEXT: movq %rsp, %rbp +; X64-NEXT: .seh_setframe %rbp, 0 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: leaq 15(,%rax,4), %rax +; X64-NEXT: andq $-16, %rax +; X64-NEXT: callq __chkstk +; X64-NEXT: subq %rax, %rsp +; X64-NEXT: subq $32, %rsp +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: movq %rbp, %rsp +; X64-NEXT: popq %rbp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test2: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: pushl %ebp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: .cfi_offset %ebp, -8 +; LINUX-NEXT: movl %esp, %ebp +; LINUX-NEXT: .cfi_def_cfa_register %ebp +; LINUX-NEXT: subl $8, %esp +; LINUX-NEXT: movl 8(%ebp), %eax +; LINUX-NEXT: movl %esp, %ecx +; LINUX-NEXT: leal 15(,%eax,4), %eax +; LINUX-NEXT: andl $-16, %eax +; LINUX-NEXT: subl %eax, %ecx +; LINUX-NEXT: movl %ecx, %esp +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: movl %ebp, %esp +; LINUX-NEXT: popl %ebp +; LINUX-NEXT: .cfi_def_cfa %esp, 4 +; LINUX-NEXT: retl entry: %a = alloca i32, i32 %k call void @good(i32 1, i32 2, i32 3, i32 4) @@ -67,30 +175,128 @@ entry: ; Again, we expect a sequence of 4 immediate pushes ; Checks that we generate the right pushes for >8bit immediates -; NORMAL-LABEL: test2b: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4096 -; NORMAL-NEXT: pushl $3072 -; NORMAL-NEXT: pushl $2048 -; NORMAL-NEXT: pushl $1024 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp define void @test2b() optsize { +; NORMAL-LABEL: test2b: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl $4096 # imm = 0x1000 +; NORMAL-NEXT: pushl $3072 # imm = 0xC00 +; NORMAL-NEXT: pushl $2048 # imm = 0x800 +; NORMAL-NEXT: pushl $1024 # imm = 0x400 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test2b: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl $4096, {{[0-9]+}}(%esp) # imm = 0x1000 +; NOPUSH-NEXT: movl $3072, {{[0-9]+}}(%esp) # imm = 0xC00 +; NOPUSH-NEXT: movl $2048, {{[0-9]+}}(%esp) # imm = 0x800 +; NOPUSH-NEXT: movl $1024, (%esp) # imm = 0x400 +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test2b: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: 
.seh_endprologue +; X64-NEXT: movl $1024, %ecx # imm = 0x400 +; X64-NEXT: movl $2048, %edx # imm = 0x800 +; X64-NEXT: movl $3072, %r8d # imm = 0xC00 +; X64-NEXT: movl $4096, %r9d # imm = 0x1000 +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test2b: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl $4096 # imm = 0x1000 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3072 # imm = 0xC00 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2048 # imm = 0x800 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1024 # imm = 0x400 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: call void @good(i32 1024, i32 2048, i32 3072, i32 4096) ret void } ; The first push should push a register -; NORMAL-LABEL: test3: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl %e{{..}} -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp define void @test3(i32 %k) optsize { +; NORMAL-LABEL: test3: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: incl %eax +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test3: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: incl %eax +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test3: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: incl %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test3: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: incl %eax +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: %f = add i32 %k, 1 call void @good(i32 %f, i32 2, i32 3, i32 4) @@ -98,28 +304,105 @@ entry: } ; We support weird calling conventions -; NORMAL-LABEL: test4: -; NORMAL: movl $2, %eax -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $12, %esp define void @test4() optsize { +; NORMAL-LABEL: test4: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl $2, %eax +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _inreg +; NORMAL-NEXT: addl $12, 
%esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test4: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $12, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: movl $2, %eax +; NOPUSH-NEXT: calll _inreg +; NOPUSH-NEXT: addl $12, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test4: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq inreg +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test4: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $16, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset 16 +; LINUX-NEXT: movl $2, %eax +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll inreg@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: call void @inreg(i32 1, i32 inreg 2, i32 3, i32 4) ret void } -; NORMAL-LABEL: test4b: -; NORMAL: movl 4(%esp), %ecx -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: ret define void @test4b(ptr %f) optsize { +; NORMAL-LABEL: test4b: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _thiscall +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test4b: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _thiscall +; NOPUSH-NEXT: retl +; +; LINUX-LABEL: test4b: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll thiscall@PLT +; LINUX-NEXT: .cfi_adjust_cfa_offset -16 +; LINUX-NEXT: addl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 4 +; LINUX-NEXT: retl entry: call x86_thiscallcc void @thiscall(ptr %f, i32 1, i32 2, i32 3, i32 4) ret void @@ -128,13 +411,54 @@ entry: ; Check that pushing the addresses of globals (Or generally, things that ; aren't exactly immediates) isn't broken. ; Fixes PR21878. 
-; NORMAL-LABEL: test6: -; NORMAL: pushl $_ext -; NORMAL-NEXT: call declare void @f(ptr) @ext = external dso_local constant i8 define void @test6() { +; NORMAL-LABEL: test6: +; NORMAL: # %bb.0: +; NORMAL-NEXT: pushl %ebp +; NORMAL-NEXT: movl %esp, %ebp +; NORMAL-NEXT: pushl $_ext +; NORMAL-NEXT: calll _f +; NORMAL-NEXT: addl $4, %esp +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: movl %ebp, %esp +; NORMAL-NEXT: popl %ebp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test6: +; NOPUSH: # %bb.0: +; NOPUSH-NEXT: pushl %ebp +; NOPUSH-NEXT: movl %esp, %ebp +; NOPUSH-NEXT: subl $4, %esp +; NOPUSH-NEXT: movl $_ext, (%esp) +; NOPUSH-NEXT: calll _f +; NOPUSH-NEXT: addl $4, %esp +; NOPUSH-NEXT: pushl %eax +; NOPUSH-NEXT: movl %ebp, %esp +; NOPUSH-NEXT: popl %ebp +; NOPUSH-NEXT: retl +; +; LINUX-LABEL: test6: +; LINUX: # %bb.0: +; LINUX-NEXT: pushl %ebp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: .cfi_offset %ebp, -8 +; LINUX-NEXT: movl %esp, %ebp +; LINUX-NEXT: .cfi_def_cfa_register %ebp +; LINUX-NEXT: subl $8, %esp +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: pushl $ext +; LINUX-NEXT: calll f@PLT +; LINUX-NEXT: addl $16, %esp +; LINUX-NEXT: movl %esp, %eax +; LINUX-NEXT: addl $-16, %eax +; LINUX-NEXT: movl %eax, %esp +; LINUX-NEXT: movl %ebp, %esp +; LINUX-NEXT: popl %ebp +; LINUX-NEXT: .cfi_def_cfa %esp, 4 +; LINUX-NEXT: retl call void @f(ptr @ext) br label %bb bb: @@ -143,16 +467,65 @@ bb: } ; Check that we fold simple cases into the push -; NORMAL-LABEL: test7: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: movl 4(%esp), [[EAX:%e..]] -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl ([[EAX]]) -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp define void @test7(ptr %ptr) optsize { +; NORMAL-LABEL: test7: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl (%eax) +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test7: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl (%eax), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test7: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl (%rcx), %r8d +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test7: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl (%eax) +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: %val = load i32, ptr %ptr call void @good(i32 1, i32 2, i32 %val, i32 4) @@ -163,14 +536,63 @@ entry: ; In particular, at the second push, %b 
was at 12(%esp) and ; %a wast at 8(%esp), but the second push bumped %esp, so %a ; is now it at 12(%esp) -; NORMAL-LABEL: test8: -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl 12(%esp) -; NORMAL-NEXT: pushl 12(%esp) -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp define void @test8(i32 %a, i32 %b) optsize { +; NORMAL-LABEL: test8: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl {{[0-9]+}}(%esp) +; NORMAL-NEXT: pushl {{[0-9]+}}(%esp) +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test8: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test8: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl %edx, %r8d +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test8: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl {{[0-9]+}}(%esp) +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl {{[0-9]+}}(%esp) +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: call void @good(i32 1, i32 %a, i32 %b, i32 4) ret void @@ -179,26 +601,125 @@ entry: ; If one function is using push instructions, and the other isn't ; (because it has frame-index references), then we must resolve ; these references correctly. 
-; NORMAL-LABEL: test9: -; NORMAL-NOT: leal (%esp), -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: movl (%esp), [[E1:%e..]] -; NORMAL-NEXT: movl 4(%esp), [[E2:%e..]] -; NORMAL-NEXT: leal 16(%esp), [[E3:%e..]] -; NORMAL-NEXT: leal 12(%esp), [[E4:%e..]] -; NORMAL-NEXT: pushl [[E3]] -; NORMAL-NEXT: pushl [[E4]] -; NORMAL-NEXT: pushl $6 -; NORMAL-NEXT: pushl [[E2]] -; NORMAL-NEXT: pushl [[E1]] -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $20, %esp define void @test9() optsize { +; NORMAL-LABEL: test9: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl %ebp +; NORMAL-NEXT: movl %esp, %ebp +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: andl $-8, %esp +; NORMAL-NEXT: subl $24, %esp +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: movl (%esp), %eax +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: leal {{[0-9]+}}(%esp), %edx +; NORMAL-NEXT: leal {{[0-9]+}}(%esp), %esi +; NORMAL-NEXT: pushl %edx +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: pushl $6 +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _struct +; NORMAL-NEXT: addl $20, %esp +; NORMAL-NEXT: leal -4(%ebp), %esp +; NORMAL-NEXT: popl %esi +; NORMAL-NEXT: popl %ebp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test9: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: pushl %ebp +; NOPUSH-NEXT: movl %esp, %ebp +; NOPUSH-NEXT: andl $-8, %esp +; NOPUSH-NEXT: subl $40, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $6, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _struct +; NOPUSH-NEXT: movl %ebp, %esp +; NOPUSH-NEXT: popl %ebp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test9: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $72, %rsp +; X64-NEXT: .seh_stackalloc 72 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rax, (%rcx) +; X64-NEXT: movl $6, %edx +; X64-NEXT: # kill: def $r8d killed $r8d killed $r8 +; X64-NEXT: # kill: def $r9d killed $r9d killed $r9 +; X64-NEXT: callq struct +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $72, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test9: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: subl $24, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: .cfi_offset %esi, -8 +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll 
good@PLT +; LINUX-NEXT: addl $4, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -4 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: leal {{[0-9]+}}(%esp), %edx +; LINUX-NEXT: leal {{[0-9]+}}(%esp), %esi +; LINUX-NEXT: pushl %edx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $6 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll struct@PLT +; LINUX-NEXT: addl $56, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -56 +; LINUX-NEXT: popl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 4 +; LINUX-NEXT: retl entry: %p = alloca i32, align 4 %q = alloca i32, align 4 @@ -213,18 +734,99 @@ entry: ; We can end up with an indirect call which gets reloaded on the spot. ; Make sure we reference the correct stack slot - we spill into (%esp) ; and reload from 16(%esp) due to the pushes. -; NORMAL-LABEL: test10: -; NORMAL: movl $_good, [[ALLOC:.*]] -; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]] -; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill -; NORMAL: nop -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: calll *16(%esp) -; NORMAL-NEXT: addl $24, %esp define void @test10() optsize { +; NORMAL-LABEL: test10: +; NORMAL: # %bb.0: +; NORMAL-NEXT: pushl %ebp +; NORMAL-NEXT: pushl %ebx +; NORMAL-NEXT: pushl %edi +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: subl $8, %esp +; NORMAL-NEXT: movl $_good, {{[0-9]+}}(%esp) +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: movl %eax, (%esp) # 4-byte Spill +; NORMAL-NEXT: #APP +; NORMAL-NEXT: nop +; NORMAL-NEXT: #NO_APP +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; NORMAL-NEXT: addl $24, %esp +; NORMAL-NEXT: popl %esi +; NORMAL-NEXT: popl %edi +; NORMAL-NEXT: popl %ebx +; NORMAL-NEXT: popl %ebp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test10: +; NOPUSH: # %bb.0: +; NOPUSH-NEXT: pushl %ebp +; NOPUSH-NEXT: pushl %ebx +; NOPUSH-NEXT: pushl %edi +; NOPUSH-NEXT: pushl %esi +; NOPUSH-NEXT: subl $24, %esp +; NOPUSH-NEXT: movl $_good, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; NOPUSH-NEXT: #APP +; NOPUSH-NEXT: nop +; NOPUSH-NEXT: #NO_APP +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; NOPUSH-NEXT: addl $24, %esp +; NOPUSH-NEXT: popl %esi +; NOPUSH-NEXT: popl %edi +; NOPUSH-NEXT: popl %ebx +; NOPUSH-NEXT: popl %ebp +; NOPUSH-NEXT: retl +; +; LINUX-LABEL: test10: +; LINUX: # %bb.0: +; LINUX-NEXT: pushl %ebp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: pushl %ebx +; LINUX-NEXT: .cfi_def_cfa_offset 12 +; LINUX-NEXT: pushl %edi +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 20 +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: .cfi_offset %esi, -20 +; LINUX-NEXT: .cfi_offset %edi, -16 +; LINUX-NEXT: .cfi_offset %ebx, -12 +; LINUX-NEXT: .cfi_offset %ebp, -8 +; LINUX-NEXT: movl $good, {{[0-9]+}}(%esp) +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; LINUX-NEXT: #APP +; LINUX-NEXT: nop +; LINUX-NEXT: #NO_APP +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: popl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: popl %edi +; LINUX-NEXT: .cfi_def_cfa_offset 12 +; LINUX-NEXT: popl %ebx +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: popl %ebp +; LINUX-NEXT: .cfi_def_cfa_offset 4 +; LINUX-NEXT: retl %stack_fptr = alloca ptr store ptr @good, ptr %stack_fptr %good_ptr = load volatile ptr, ptr %stack_fptr @@ -235,17 +837,69 @@ define void @test10() optsize { ; We can't fold the load from the global into the push because of ; interference from the store -; NORMAL-LABEL: test11: -; NORMAL: movl _the_global, [[EAX:%e..]] -; NORMAL-NEXT: movl $42, _the_global -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl [[EAX]] -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp @the_global = external dso_local global i32 define void @test11() optsize { +; NORMAL-LABEL: test11: +; NORMAL: # %bb.0: +; NORMAL-NEXT: movl _the_global, %eax +; NORMAL-NEXT: movl $42, _the_global +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test11: +; NOPUSH: # %bb.0: +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl _the_global, %eax +; NOPUSH-NEXT: movl $42, _the_global +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test11: +; X64: # %bb.0: +; X64-NEXT: subq $40, %rsp +; X64-NEXT: .seh_stackalloc 40 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl the_global(%rip), %ecx +; X64-NEXT: movl $42, the_global(%rip) +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $40, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test11: +; LINUX: # %bb.0: +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movl the_global, %eax +; LINUX-NEXT: movl $42, the_global +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl %myload = load i32, ptr @the_global store i32 42, ptr @the_global call void @good(i32 %myload, i32 2, i32 3, i32 4) @@ -254,13 +908,140 @@ define void @test11() optsize { ; Converting one mov into a push isn't worth it when ; doing so forces too much overhead for other calls. 
-; NORMAL-LABEL: test12: -; NORMAL: pushl $8 -; NORMAL-NEXT: pushl $7 -; NORMAL-NEXT: pushl $6 -; NORMAL-NEXT: pushl $5 -; NORMAL-NEXT: calll _good define void @test12() optsize { +; NORMAL-LABEL: test12: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: subl $8, %esp +; NORMAL-NEXT: movl (%esp), %eax +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _struct +; NORMAL-NEXT: addl $20, %esp +; NORMAL-NEXT: pushl $8 +; NORMAL-NEXT: pushl $7 +; NORMAL-NEXT: pushl $6 +; NORMAL-NEXT: pushl $5 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: movl (%esp), %eax +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: pushl $12 +; NORMAL-NEXT: pushl $11 +; NORMAL-NEXT: pushl $10 +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _struct +; NORMAL-NEXT: addl $28, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test12: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $28, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _struct +; NOPUSH-NEXT: movl $8, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $7, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $6, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $5, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $12, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $11, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $10, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _struct +; NOPUSH-NEXT: addl $28, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test12: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $72, %rsp +; X64-NEXT: .seh_stackalloc 72 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rax, (%rcx) +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq struct +; X64-NEXT: movl $5, %ecx +; X64-NEXT: movl $6, %edx +; X64-NEXT: movl $7, %r8d +; X64-NEXT: movl $8, %r9d +; X64-NEXT: callq good +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rax, (%rcx) +; X64-NEXT: movl $10, %edx +; X64-NEXT: movl $11, %r8d +; X64-NEXT: movl $12, %r9d +; X64-NEXT: callq struct +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $72, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test12: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $24, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset 24 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll struct@PLT +; LINUX-NEXT: addl $32, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -32 +; LINUX-NEXT: pushl $8 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $7 +; 
LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $6 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $5 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $4, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -4 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: pushl $12 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $11 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $10 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll struct@PLT +; LINUX-NEXT: addl $44, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -44 +; LINUX-NEXT: retl entry: %s = alloca %struct.s, align 4 call void @struct(ptr byval(%struct.s) %s, i32 2, i32 3, i32 4) @@ -270,29 +1051,128 @@ entry: } ; But if the gains outweigh the overhead, we should do it -; NORMAL-LABEL: test12b: -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: calll _good -; NORMAL-NEXT: addl $16, %esp -; NORMAL=NEXT: movl (%esp), %eax -; NORMAL=NEXT: movl 4(%esp), %ecx -; NORMAL=NEXT: pushl $8 -; NORMAL=NEXT: pushl $7 -; NORMAL=NEXT: pushl $6 -; NORMAL=NEXT: pushl %ecx -; NORMAL=NEXT: pushl %eax -; NORMAL=NEXT: calll _struct -; NORMAL=NEXT: addl $20, %esp -; NORMAL=NEXT: pushl $12 -; NORMAL=NEXT: pushl $11 -; NORMAL=NEXT: pushl $10 -; NORMAL=NEXT: pushl $9 -; NORMAL=NEXT: calll _good -; NORMAL=NEXT: addl $16, %esp define void @test12b() optsize { +; NORMAL-LABEL: test12b: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: subl $8, %esp +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: movl (%esp), %eax +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: pushl $8 +; NORMAL-NEXT: pushl $7 +; NORMAL-NEXT: pushl $6 +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _struct +; NORMAL-NEXT: addl $20, %esp +; NORMAL-NEXT: pushl $12 +; NORMAL-NEXT: pushl $11 +; NORMAL-NEXT: pushl $10 +; NORMAL-NEXT: pushl $9 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $24, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test12b: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $28, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $8, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $7, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $6, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _struct +; NOPUSH-NEXT: movl $12, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $11, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $10, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $9, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $28, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test12b: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $56, %rsp +; X64-NEXT: .seh_stackalloc 56 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: callq good +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rax, (%rcx) +; 
X64-NEXT: movl $6, %edx +; X64-NEXT: movl $7, %r8d +; X64-NEXT: movl $8, %r9d +; X64-NEXT: callq struct +; X64-NEXT: movl $9, %ecx +; X64-NEXT: movl $10, %edx +; X64-NEXT: movl $11, %r8d +; X64-NEXT: movl $12, %r9d +; X64-NEXT: callq good +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $56, %rsp +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test12b: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $4, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -4 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: pushl $8 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $7 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $6 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll struct@PLT +; LINUX-NEXT: addl $32, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -32 +; LINUX-NEXT: pushl $12 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $11 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $10 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $9 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: %s = alloca %struct.s, align 4 call void @good(i32 1, i32 2, i32 3, i32 4) @@ -304,17 +1184,96 @@ entry: ; Make sure the add does not prevent folding loads into pushes. ; val1 and val2 will not be folded into pushes since they have ; an additional use, but val3 should be. 
-; NORMAL-LABEL: test13: -; NORMAL: movl ([[P1:%e..]]), [[V1:%e..]] -; NORMAL-NEXT: movl ([[P2:%e..]]), [[V2:%e..]] -; NORMAL-NEXT: , [[ADD:%e..]] -; NORMAL-NEXT: pushl [[ADD]] -; NORMAL-NEXT: pushl ([[P3:%e..]]) -; NORMAL-NEXT: pushl [[V2]] -; NORMAL-NEXT: pushl [[V1]] -; NORMAL-NEXT: calll _good -; NORMAL: movl [[P3]], %eax define ptr @test13(ptr inreg %ptr1, ptr inreg %ptr2, ptr inreg %ptr3) optsize { +; NORMAL-LABEL: test13: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: movl %ecx, %esi +; NORMAL-NEXT: movl (%eax), %eax +; NORMAL-NEXT: movl (%edx), %ecx +; NORMAL-NEXT: leal (%eax,%ecx), %edx +; NORMAL-NEXT: pushl %edx +; NORMAL-NEXT: pushl (%esi) +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: movl %esi, %eax +; NORMAL-NEXT: popl %esi +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test13: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: pushl %edi +; NOPUSH-NEXT: pushl %esi +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl %ecx, %esi +; NOPUSH-NEXT: movl (%eax), %eax +; NOPUSH-NEXT: movl (%edx), %ecx +; NOPUSH-NEXT: movl (%esi), %edx +; NOPUSH-NEXT: leal (%eax,%ecx), %edi +; NOPUSH-NEXT: movl %edi, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: movl %esi, %eax +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: popl %esi +; NOPUSH-NEXT: popl %edi +; NOPUSH-NEXT: retl +; +; X64-LABEL: test13: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rsi +; X64-NEXT: .seh_pushreg %rsi +; X64-NEXT: subq $32, %rsp +; X64-NEXT: .seh_stackalloc 32 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movq %r8, %rsi +; X64-NEXT: movl (%rcx), %ecx +; X64-NEXT: movl (%rdx), %edx +; X64-NEXT: movl (%r8), %r8d +; X64-NEXT: leal (%rcx,%rdx), %r9d +; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx +; X64-NEXT: # kill: def $edx killed $edx killed $rdx +; X64-NEXT: callq good +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $32, %rsp +; X64-NEXT: popq %rsi +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test13: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: subl $8, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: .cfi_offset %esi, -8 +; LINUX-NEXT: movl %ecx, %esi +; LINUX-NEXT: movl (%eax), %eax +; LINUX-NEXT: movl (%edx), %ecx +; LINUX-NEXT: leal (%eax,%ecx), %edx +; LINUX-NEXT: pushl %edx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl (%esi) +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $16, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -16 +; LINUX-NEXT: movl %esi, %eax +; LINUX-NEXT: addl $8, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: popl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 4 +; LINUX-NEXT: retl entry: %val1 = load i32, ptr %ptr1 %val2 = load i32, ptr %ptr2 @@ -325,24 +1284,52 @@ entry: } ; Make sure to fold adjacent stack adjustments. 
-; LINUX-LABEL: pr27140: -; LINUX: subl $12, %esp -; LINUX: .cfi_def_cfa_offset 16 -; LINUX-NOT: sub -; LINUX: pushl $4 -; LINUX: .cfi_adjust_cfa_offset 4 -; LINUX: pushl $3 -; LINUX: .cfi_adjust_cfa_offset 4 -; LINUX: pushl $2 -; LINUX: .cfi_adjust_cfa_offset 4 -; LINUX: pushl $1 -; LINUX: .cfi_adjust_cfa_offset 4 -; LINUX: calll good -; LINUX: addl $28, %esp -; LINUX: .cfi_adjust_cfa_offset -28 -; LINUX-NOT: add -; LINUX: retl define void @pr27140() optsize { +; NORMAL-LABEL: pr27140: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: pr27140: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $16, %esp +; NOPUSH-NEXT: movl $4, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $3, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $2, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl $1, (%esp) +; NOPUSH-NEXT: calll _good +; NOPUSH-NEXT: addl $16, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: pr27140: +; X64: # %bb.0: # %entry +; X64-NEXT: movl $1, %ecx +; X64-NEXT: movl $2, %edx +; X64-NEXT: movl $3, %r8d +; X64-NEXT: movl $4, %r9d +; X64-NEXT: jmp good # TAILCALL +; +; LINUX-LABEL: pr27140: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: pushl $4 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $3 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $2 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll good@PLT +; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -28 +; LINUX-NEXT: retl entry: tail call void @good(i32 1, i32 2, i32 3, i32 4) ret void @@ -351,16 +1338,134 @@ entry: ; Check that a stack restore (leal -4(%ebp), %esp) doesn't get merged with a ; stack adjustment (addl $12, %esp). Just because it's a lea doesn't mean it's ; simply decreasing the stack pointer. 
-; NORMAL-LABEL: test14: -; NORMAL: calll _B_func -; NORMAL: leal -4(%ebp), %esp -; NORMAL-NOT: %esp -; NORMAL: retl %struct.A = type { i32, i32 } %struct.B = type { i8 } declare x86_thiscallcc ptr @B_ctor(ptr returned, ptr byval(%struct.A)) declare void @B_func(ptr sret(%struct.B), ptr, i32) define void @test14(ptr %a) { +; NORMAL-LABEL: test14: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: pushl %ebp +; NORMAL-NEXT: movl %esp, %ebp +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: andl $-8, %esp +; NORMAL-NEXT: subl $24, %esp +; NORMAL-NEXT: movl 8(%ebp), %eax +; NORMAL-NEXT: movl (%eax), %edx +; NORMAL-NEXT: movl 4(%eax), %eax +; NORMAL-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NORMAL-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: leal {{[0-9]+}}(%esp), %esi +; NORMAL-NEXT: movl %esi, %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %edx +; NORMAL-NEXT: calll _B_ctor +; NORMAL-NEXT: leal {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: pushl %esi +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _B_func +; NORMAL-NEXT: addl $12, %esp +; NORMAL-NEXT: leal -4(%ebp), %esp +; NORMAL-NEXT: popl %esi +; NORMAL-NEXT: popl %ebp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: test14: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: pushl %ebp +; NOPUSH-NEXT: movl %esp, %ebp +; NOPUSH-NEXT: pushl %esi +; NOPUSH-NEXT: andl $-8, %esp +; NOPUSH-NEXT: subl $32, %esp +; NOPUSH-NEXT: movl 8(%ebp), %eax +; NOPUSH-NEXT: movl (%eax), %ecx +; NOPUSH-NEXT: movl 4(%eax), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, (%esp) +; NOPUSH-NEXT: leal {{[0-9]+}}(%esp), %esi +; NOPUSH-NEXT: movl %esi, %ecx +; NOPUSH-NEXT: calll _B_ctor +; NOPUSH-NEXT: subl $8, %esp +; NOPUSH-NEXT: movl %esi, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $1, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _B_func +; NOPUSH-NEXT: leal -4(%ebp), %esp +; NOPUSH-NEXT: popl %esi +; NOPUSH-NEXT: popl %ebp +; NOPUSH-NEXT: retl +; +; X64-LABEL: test14: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rsi +; X64-NEXT: .seh_pushreg %rsi +; X64-NEXT: subq $64, %rsp +; X64-NEXT: .seh_stackalloc 64 +; X64-NEXT: .seh_endprologue +; X64-NEXT: movq (%rcx), %rax +; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: callq B_ctor +; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: movl $1, %r8d +; X64-NEXT: callq B_func +; X64-NEXT: nop +; X64-NEXT: .seh_startepilogue +; X64-NEXT: addq $64, %rsp +; X64-NEXT: popq %rsi +; X64-NEXT: .seh_endepilogue +; X64-NEXT: retq +; X64-NEXT: .seh_endproc +; +; LINUX-LABEL: test14: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: subl $24, %esp +; LINUX-NEXT: .cfi_def_cfa_offset 32 +; LINUX-NEXT: .cfi_offset %esi, -8 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl (%eax), %edx +; LINUX-NEXT: movl 4(%eax), %eax +; LINUX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; LINUX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; LINUX-NEXT: subl $8, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset 8 +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: leal {{[0-9]+}}(%esp), %esi +; LINUX-NEXT: movl %esi, %ecx +; LINUX-NEXT: pushl %eax +; 
LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %edx +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll B_ctor@PLT +; LINUX-NEXT: .cfi_adjust_cfa_offset -8 +; LINUX-NEXT: addl $4, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -4 +; LINUX-NEXT: leal {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: pushl $1 +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %esi +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: .cfi_adjust_cfa_offset 4 +; LINUX-NEXT: calll B_func@PLT +; LINUX-NEXT: .cfi_adjust_cfa_offset -4 +; LINUX-NEXT: addl $36, %esp +; LINUX-NEXT: .cfi_adjust_cfa_offset -36 +; LINUX-NEXT: popl %esi +; LINUX-NEXT: .cfi_def_cfa_offset 4 +; LINUX-NEXT: retl entry: %ref.tmp = alloca %struct.B, align 1 %agg.tmp = alloca i64, align 8 @@ -372,115 +1477,226 @@ entry: ret void } -; NORMAL-LABEL: pr34863_16 -; NORMAL: movl 4(%esp), %eax -; NORMAL-NEXT: pushl $65535 -; NORMAL-NEXT: pushl $0 -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: calll _eightparams16 -; NORMAL-NEXT: addl $32, %esp -; -; NOPUSH-LABEL: pr34863_16 -; NOPUSH: subl $32, %esp -; NOPUSH-NEXT: movl 36(%esp), %eax -; NOPUSH-NEXT: movl %eax, 20(%esp) -; NOPUSH-NEXT: movl %eax, 16(%esp) -; NOPUSH-NEXT: movl %eax, 12(%esp) -; NOPUSH-NEXT: movl %eax, 8(%esp) -; NOPUSH-NEXT: movl %eax, 4(%esp) -; NOPUSH-NEXT: movl %eax, (%esp) -; NOPUSH-NEXT: movl $65535, 28(%esp) -; NOPUSH-NEXT: andl $0, 24(%esp) -; NOPUSH-NEXT: calll _eightparams16 -; NOPUSH-NEXT: addl $32, %esp define void @pr34863_16(i16 %x) minsize nounwind { +; NORMAL-LABEL: pr34863_16: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: pushl $65535 # imm = 0xFFFF +; NORMAL-NEXT: pushl $0 +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _eightparams16 +; NORMAL-NEXT: addl $32, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: pr34863_16: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $32, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: movl $65535, {{[0-9]+}}(%esp) # imm = 0xFFFF +; NOPUSH-NEXT: andl $0, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _eightparams16 +; NOPUSH-NEXT: addl $32, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: pr34863_16: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $72, %rsp +; X64-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; X64-NEXT: orw $-1, {{[0-9]+}}(%rsp) +; X64-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; X64-NEXT: andw $0, {{[0-9]+}}(%rsp) +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl %ecx, %r8d +; X64-NEXT: movl %ecx, %r9d +; X64-NEXT: callq eightparams16 +; X64-NEXT: addq $72, %rsp +; X64-NEXT: retq +; +; LINUX-LABEL: pr34863_16: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: pushl $65535 # imm = 0xFFFF +; LINUX-NEXT: pushl $0 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: calll eightparams16@PLT +; LINUX-NEXT: addl $44, %esp +; LINUX-NEXT: retl entry: tail call void 
@eightparams16(i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 0, i16 -1) ret void } -; NORMAL-LABEL: pr34863_32 -; NORMAL: movl 4(%esp), %eax -; NORMAL-NEXT: pushl $-1 -; NORMAL-NEXT: pushl $0 -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: calll _eightparams -; NORMAL-NEXT: addl $32, %esp -; -; NOPUSH-LABEL: pr34863_32 -; NOPUSH: subl $32, %esp -; NOPUSH-NEXT: movl 36(%esp), %eax -; NOPUSH-NEXT: movl %eax, 20(%esp) -; NOPUSH-NEXT: movl %eax, 16(%esp) -; NOPUSH-NEXT: movl %eax, 12(%esp) -; NOPUSH-NEXT: movl %eax, 8(%esp) -; NOPUSH-NEXT: movl %eax, 4(%esp) -; NOPUSH-NEXT: movl %eax, (%esp) -; NOPUSH-NEXT: orl $-1, 28(%esp) -; NOPUSH-NEXT: andl $0, 24(%esp) -; NOPUSH-NEXT: calll _eightparams -; NOPUSH-NEXT: addl $32, %esp define void @pr34863_32(i32 %x) minsize nounwind { +; NORMAL-LABEL: pr34863_32: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: pushl $-1 +; NORMAL-NEXT: pushl $0 +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _eightparams +; NORMAL-NEXT: addl $32, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: pr34863_32: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $32, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: orl $-1, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: andl $0, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _eightparams +; NOPUSH-NEXT: addl $32, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: pr34863_32: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $72, %rsp +; X64-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; X64-NEXT: orl $-1, {{[0-9]+}}(%rsp) +; X64-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; X64-NEXT: andl $0, {{[0-9]+}}(%rsp) +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl %ecx, %r8d +; X64-NEXT: movl %ecx, %r9d +; X64-NEXT: callq eightparams +; X64-NEXT: addq $72, %rsp +; X64-NEXT: retq +; +; LINUX-LABEL: pr34863_32: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: pushl $-1 +; LINUX-NEXT: pushl $0 +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: calll eightparams@PLT +; LINUX-NEXT: addl $44, %esp +; LINUX-NEXT: retl entry: tail call void @eightparams(i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 0, i32 -1) ret void } -; NORMAL-LABEL: pr34863_64 -; NORMAL: movl 4(%esp), %eax -; NORMAL-NEXT: movl 8(%esp), %ecx -; NORMAL-NEXT: pushl $-1 -; NORMAL-NEXT: pushl $-1 -; NORMAL-NEXT: pushl $0 -; NORMAL-NEXT: pushl $0 -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: pushl %ecx -; NORMAL-NEXT: pushl %eax -; NORMAL-NEXT: calll _eightparams64 -; NORMAL-NEXT: addl $64, %esp -; -; NOPUSH-LABEL: pr34863_64 -; NOPUSH: subl $64, %esp -; NOPUSH-NEXT: movl 68(%esp), %eax -; NOPUSH-NEXT: movl 72(%esp), %ecx -; NOPUSH-NEXT: 
movl %ecx, 44(%esp) -; NOPUSH-NEXT: movl %eax, 40(%esp) -; NOPUSH-NEXT: movl %ecx, 36(%esp) -; NOPUSH-NEXT: movl %eax, 32(%esp) -; NOPUSH-NEXT: movl %ecx, 28(%esp) -; NOPUSH-NEXT: movl %eax, 24(%esp) -; NOPUSH-NEXT: movl %ecx, 20(%esp) -; NOPUSH-NEXT: movl %eax, 16(%esp) -; NOPUSH-NEXT: movl %ecx, 12(%esp) -; NOPUSH-NEXT: movl %eax, 8(%esp) -; NOPUSH-NEXT: movl %ecx, 4(%esp) -; NOPUSH-NEXT: movl %eax, (%esp) -; NOPUSH-NEXT: orl $-1, 60(%esp) -; NOPUSH-NEXT: orl $-1, 56(%esp) -; NOPUSH-NEXT: andl $0, 52(%esp) -; NOPUSH-NEXT: andl $0, 48(%esp) -; NOPUSH-NEXT: calll _eightparams64 -; NOPUSH-NEXT: addl $64, %esp define void @pr34863_64(i64 %x) minsize nounwind { +; NORMAL-LABEL: pr34863_64: +; NORMAL: # %bb.0: # %entry +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %eax +; NORMAL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NORMAL-NEXT: pushl $-1 +; NORMAL-NEXT: pushl $-1 +; NORMAL-NEXT: pushl $0 +; NORMAL-NEXT: pushl $0 +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: pushl %ecx +; NORMAL-NEXT: pushl %eax +; NORMAL-NEXT: calll _eightparams64 +; NORMAL-NEXT: addl $64, %esp +; NORMAL-NEXT: retl +; +; NOPUSH-LABEL: pr34863_64: +; NOPUSH: # %bb.0: # %entry +; NOPUSH-NEXT: subl $64, %esp +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOPUSH-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: orl $-1, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: orl $-1, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: andl $0, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: movl %eax, (%esp) +; NOPUSH-NEXT: andl $0, {{[0-9]+}}(%esp) +; NOPUSH-NEXT: calll _eightparams64 +; NOPUSH-NEXT: addl $64, %esp +; NOPUSH-NEXT: retl +; +; X64-LABEL: pr34863_64: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $72, %rsp +; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; X64-NEXT: orq $-1, {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; X64-NEXT: andq $0, {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: callq eightparams64 +; X64-NEXT: addq $72, %rsp +; X64-NEXT: retq +; +; LINUX-LABEL: pr34863_64: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: subl $12, %esp +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %eax +; LINUX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; LINUX-NEXT: pushl $-1 +; LINUX-NEXT: pushl $-1 +; LINUX-NEXT: pushl $0 +; LINUX-NEXT: pushl $0 +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: pushl %ecx +; LINUX-NEXT: pushl %eax +; LINUX-NEXT: calll eightparams64@PLT +; LINUX-NEXT: addl $76, %esp +; LINUX-NEXT: retl entry: tail call void @eightparams64(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 0, i64 -1) ret void diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll 
b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..ec3d5ef585abf 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -814,8 +814,10 @@ define i16 @test_mul_spec(i16 %x) nounwind { ; X86-LABEL: test_mul_spec: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: addl $42, %ecx +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: addl $2, %eax ; X86-NEXT: imull %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl @@ -823,8 +825,10 @@ define i16 @test_mul_spec(i16 %x) nounwind { ; X64-LABEL: test_mul_spec: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx -; X64-NEXT: leal 2(%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: addl $42, %ecx +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: addl $2, %eax ; X64-NEXT: imull %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..043f827d930eb 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1336,8 +1336,10 @@ define i32 @test_mul_spec(i32 %x) nounwind { ; X86-LABEL: test_mul_spec: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: addl $42, %ecx +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: addl $2, %eax ; X86-NEXT: imull %ecx, %eax ; X86-NEXT: retl ; @@ -1362,8 +1364,10 @@ define i32 @test_mul_spec(i32 %x) nounwind { ; X86-NOOPT-LABEL: test_mul_spec: ; X86-NOOPT: # %bb.0: ; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NOOPT-NEXT: leal (%eax,%eax,8), %ecx +; X86-NOOPT-NEXT: addl $42, %ecx +; X86-NOOPT-NEXT: leal (%eax,%eax,4), %eax +; X86-NOOPT-NEXT: addl $2, %eax ; X86-NOOPT-NEXT: imull %ecx, %eax ; X86-NOOPT-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll index 03dd5351c78ac..e1635662d7805 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i64.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll @@ -57,10 +57,10 @@ define i64 @test_mul_by_2(i64 %x) { define i64 @test_mul_by_3(i64 %x) { ; X86-LABEL: test_mul_by_3: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: movl $3, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; @@ -108,10 +108,10 @@ define i64 @test_mul_by_4(i64 %x) { define i64 @test_mul_by_5(i64 %x) { ; X86-LABEL: test_mul_by_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: movl $5, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; @@ -134,10 +134,10 @@ define i64 @test_mul_by_5(i64 %x) { define i64 @test_mul_by_6(i64 %x) { ; X86-LABEL: test_mul_by_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: movl $6, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; @@ -166,12 +166,17 @@ define i64 @test_mul_by_6(i64 %x) { define i64 @test_mul_by_7(i64 %x) { ; X86-LABEL: test_mul_by_7: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (,%eax,8), %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $7, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (,%ecx,8), %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_7: @@ -224,10 +229,10 @@ define i64 @test_mul_by_8(i64 %x) { define i64 @test_mul_by_9(i64 %x) { ; X86-LABEL: test_mul_by_9: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: movl $9, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; @@ -250,10 +255,10 @@ define i64 @test_mul_by_9(i64 %x) { define i64 @test_mul_by_10(i64 %x) { ; X86-LABEL: test_mul_by_10: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: movl $10, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; @@ -282,12 +287,17 @@ define i64 @test_mul_by_10(i64 %x) { define i64 @test_mul_by_11(i64 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $11, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %esi +; X86-NEXT: leal (%ecx,%esi,2), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_11: @@ -326,10 +336,10 @@ define i64 @test_mul_by_11(i64 %x) { define i64 @test_mul_by_12(i64 %x) { ; X86-LABEL: test_mul_by_12: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: movl $12, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; @@ -358,12 +368,17 @@ define i64 @test_mul_by_12(i64 %x) { define i64 @test_mul_by_13(i64 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $13, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %esi +; X86-NEXT: leal (%ecx,%esi,4), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_13: @@ -402,13 +417,18 @@ define i64 @test_mul_by_13(i64 %x) { define i64 @test_mul_by_14(i64 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: 
.cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shll $4, %ecx -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $14, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx), %esi +; X86-NEXT: shll $4, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_14: @@ -438,9 +458,9 @@ define i64 @test_mul_by_14(i64 %x) { define i64 @test_mul_by_15(i64 %x) { ; X86-LABEL: test_mul_by_15: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $15, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %edx @@ -497,13 +517,18 @@ define i64 @test_mul_by_16(i64 %x) { define i64 @test_mul_by_17(i64 %x) { ; X86-LABEL: test_mul_by_17: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $4, %ecx -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $17, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $4, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_17: @@ -532,10 +557,10 @@ define i64 @test_mul_by_17(i64 %x) { define i64 @test_mul_by_18(i64 %x) { ; X86-LABEL: test_mul_by_18: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: movl $18, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; @@ -564,12 +589,17 @@ define i64 @test_mul_by_18(i64 %x) { define i64 @test_mul_by_19(i64 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $19, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %esi +; X86-NEXT: leal (%ecx,%esi,2), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_19: @@ -608,10 +638,10 @@ define i64 @test_mul_by_19(i64 %x) { define i64 @test_mul_by_20(i64 %x) { ; X86-LABEL: test_mul_by_20: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: movl $20, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; @@ -640,12 +670,17 @@ define i64 @test_mul_by_20(i64 %x) { define i64 @test_mul_by_21(i64 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $21, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal 
(%ecx,%ecx,4), %esi +; X86-NEXT: leal (%ecx,%esi,4), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_21: @@ -688,10 +723,10 @@ define i64 @test_mul_by_22(i64 %x) { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %esi ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx,4), %esi +; X86-NEXT: leal (%ecx,%esi,4), %esi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi @@ -736,13 +771,18 @@ define i64 @test_mul_by_22(i64 %x) { define i64 @test_mul_by_23(i64 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $23, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: leal (%ecx,%ecx,2), %esi +; X86-NEXT: shll $3, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_23: @@ -783,10 +823,10 @@ define i64 @test_mul_by_23(i64 %x) { define i64 @test_mul_by_24(i64 %x) { ; X86-LABEL: test_mul_by_24: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: movl $24, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: leal (%edx,%ecx,8), %edx ; X86-NEXT: retl ; @@ -815,9 +855,9 @@ define i64 @test_mul_by_24(i64 %x) { define i64 @test_mul_by_25(i64 %x) { ; X86-LABEL: test_mul_by_25: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $25, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: leal (%ecx,%ecx,4), %ecx ; X86-NEXT: addl %ecx, %edx @@ -852,10 +892,10 @@ define i64 @test_mul_by_26(i64 %x) { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%eax,%eax,4), %esi ; X86-NEXT: movl $26, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx,4), %esi +; X86-NEXT: leal (%esi,%esi,4), %esi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi @@ -900,9 +940,9 @@ define i64 @test_mul_by_26(i64 %x) { define i64 @test_mul_by_27(i64 %x) { ; X86-LABEL: test_mul_by_27: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $27, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: leal (%ecx,%ecx,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %edx @@ -937,10 +977,10 @@ define i64 @test_mul_by_28(i64 %x) { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %esi ; X86-NEXT: movl $28, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx,8), %esi +; X86-NEXT: leal (%esi,%esi,2), %esi ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi @@ -989,11 +1029,11 @@ define i64 @test_mul_by_29(i64 %x) { ; 
X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %esi -; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %esi ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%esi,%esi,2), %esi +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi @@ -1040,13 +1080,18 @@ define i64 @test_mul_by_29(i64 %x) { define i64 @test_mul_by_30(i64 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shll $5, %ecx -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $30, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx), %esi +; X86-NEXT: shll $5, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_30: @@ -1076,13 +1121,18 @@ define i64 @test_mul_by_30(i64 %x) { define i64 @test_mul_by_31(i64 %x) { ; X86-LABEL: test_mul_by_31: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $31, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $5, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: addl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_31: @@ -1137,12 +1187,17 @@ define i64 @test_mul_by_32(i64 %x) { define i64 @test_mul_by_37(i64 %x) { ; X86-LABEL: test_mul_by_37: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $37, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %esi +; X86-NEXT: leal (%ecx,%esi,4), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_37: @@ -1181,12 +1236,17 @@ define i64 @test_mul_by_37(i64 %x) { define i64 @test_mul_by_41(i64 %x) { ; X86-LABEL: test_mul_by_41: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,8), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $41, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %esi +; X86-NEXT: leal (%ecx,%esi,8), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_41: @@ -1225,13 +1285,18 @@ define i64 @test_mul_by_41(i64 %x) { define i64 @test_mul_by_62(i64 %x) { ; X86-LABEL: test_mul_by_62: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shll $6, %ecx -; X86-NEXT: subl %eax, %ecx ; 
X86-NEXT: movl $62, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: leal (%ecx,%ecx), %esi +; X86-NEXT: shll $6, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_62: @@ -1261,13 +1326,18 @@ define i64 @test_mul_by_62(i64 %x) { define i64 @test_mul_by_66(i64 %x) { ; X86-LABEL: test_mul_by_66: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $6, %ecx -; X86-NEXT: leal (%ecx,%eax,2), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $66, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $6, %esi +; X86-NEXT: leal (%esi,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1310,12 +1380,17 @@ define i64 @test_mul_by_66(i64 %x) { define i64 @test_mul_by_73(i64 %x) { ; X86-LABEL: test_mul_by_73: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%eax,%ecx,8), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl $73, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %esi +; X86-NEXT: leal (%ecx,%esi,8), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_73: @@ -1354,13 +1429,18 @@ define i64 @test_mul_by_73(i64 %x) { define i64 @test_mul_by_520(i64 %x) { ; X86-LABEL: test_mul_by_520: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $9, %ecx -; X86-NEXT: leal (%ecx,%eax,8), %ecx +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $520, %eax # imm = 0x208 ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shll $9, %esi +; X86-NEXT: leal (%esi,%ecx,8), %ecx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_520: @@ -1497,13 +1577,14 @@ define i64 @test_mul_spec(i64 %x) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl $9, %ecx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: leal (%ebp,%ebp,8), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: leal (,%ebp,8), %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: addl $42, %esi ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl $5, %edx @@ -1511,7 +1592,8 @@ define i64 @test_mul_spec(i64 %x) nounwind { ; X86-NEXT: mull %edx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: leal (%ebp,%ebp,4), %eax +; X86-NEXT: leal (,%ebp,4), %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: addl $2, %edi ; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl %esi, %eax @@ -1533,13 +1615,14 @@ define i64 @test_mul_spec(i64 %x) nounwind { ; X86-NOOPT-NEXT: pushl %edi ; X86-NOOPT-NEXT: pushl %esi ; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOOPT-NEXT: movl $9, %ecx ; X86-NOOPT-NEXT: movl %edi, 
%eax ; X86-NOOPT-NEXT: mull %ecx ; X86-NOOPT-NEXT: movl %eax, %esi ; X86-NOOPT-NEXT: movl %edx, %ecx -; X86-NOOPT-NEXT: leal (%ebp,%ebp,8), %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NOOPT-NEXT: leal (,%ebp,8), %eax +; X86-NOOPT-NEXT: addl %ebp, %eax ; X86-NOOPT-NEXT: addl $42, %esi ; X86-NOOPT-NEXT: adcl %eax, %ecx ; X86-NOOPT-NEXT: movl $5, %edx @@ -1547,7 +1630,8 @@ define i64 @test_mul_spec(i64 %x) nounwind { ; X86-NOOPT-NEXT: mull %edx ; X86-NOOPT-NEXT: movl %eax, %edi ; X86-NOOPT-NEXT: movl %edx, %ebx -; X86-NOOPT-NEXT: leal (%ebp,%ebp,4), %eax +; X86-NOOPT-NEXT: leal (,%ebp,4), %eax +; X86-NOOPT-NEXT: addl %ebp, %eax ; X86-NOOPT-NEXT: addl $2, %edi ; X86-NOOPT-NEXT: adcl %eax, %ebx ; X86-NOOPT-NEXT: movl %esi, %eax @@ -1620,9 +1704,9 @@ define i64 @PR111325(i64 %a0, i1 %a1) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $1, %cl -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpb $1, %cl ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: orl %edx, %eax @@ -1633,9 +1717,9 @@ define i64 @PR111325(i64 %a0, i1 %a1) { ; X86-NOOPT: # %bb.0: # %entry ; X86-NOOPT-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOOPT-NEXT: andb $1, %cl -; X86-NOOPT-NEXT: xorl %eax, %eax ; X86-NOOPT-NEXT: xorl %edx, %edx ; X86-NOOPT-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: xorl %eax, %eax ; X86-NOOPT-NEXT: cmpb $1, %cl ; X86-NOOPT-NEXT: sbbl %eax, %eax ; X86-NOOPT-NEXT: orl %edx, %eax diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll index 1f9e7a93ad0b9..008c1c708dd99 100644 --- a/llvm/test/CodeGen/X86/mul-constant-result.ll +++ b/llvm/test/CodeGen/X86/mul-constant-result.ll @@ -13,9 +13,9 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl $2, %edx ; X86-NEXT: movl $1, %eax ; X86-NEXT: movl $1, %esi +; X86-NEXT: cmpl $2, %edx ; X86-NEXT: jge .LBB0_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl %edx, %esi diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll index bb93e34fda7c4..8aadabbdba0b6 100644 --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -10,24 +10,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $400, %esp # imm = 0x190 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 60(%eax), %ebp -; X86-NEXT: movl 56(%eax), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 56(%esi), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl (%ecx), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl 60(%esi), %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl 4(%ebx), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: movl 4(%eax), %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -2259,29 +2260,29 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax @@ -3138,33 +3139,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: imull %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: addl %ecx, %edx 
; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: imull %ebp, %eax +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3183,25 +3184,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ebp +; X86-NEXT: mull %edi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx @@ -3209,13 +3210,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: addl %eax, %edx ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3360,18 +3362,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %eax @@ -3386,10 +3389,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3400,26 +3403,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl 112(%esi), %edi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 112(%ecx), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl 116(%esi), %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: imull %edi, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl 116(%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl 120(%esi), %eax +; X86-NEXT: movl 120(%ecx), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: imull %esi, %ecx @@ -3460,14 +3464,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: imull %ebp, %ecx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: addl %edx, %eax @@ -4268,13 +4272,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: imull %eax, %ecx 
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -4323,16 +4329,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 124(%ebx), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %eax, %ecx ; X86-NEXT: movl 120(%ebx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl 124(%ebx), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl 112(%ebx), %edi ; X86-NEXT: movl 116(%ebx), %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4517,13 +4522,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %edi, %esi -; X86-NEXT: imull %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: addl %edx, %eax @@ -4573,13 +4579,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %eax, %edx ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: addl %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4824,13 +4830,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $240, %rsp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 40(%rdi), %rbx -; 
X64-NEXT: movq 32(%rdi), %r12 ; X64-NEXT: movq 56(%rdi), %r15 ; X64-NEXT: movq 48(%rdi), %r10 +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %r11 -; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq 8(%rsi), %r8 ; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -4845,20 +4851,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r8, %rcx +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %r9, %r8 ; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: movq 32(%r14), %r12 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi -; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: adcq %r9, %r14 ; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 @@ -4871,7 +4879,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -4879,7 +4887,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %r9, %r15 @@ -4888,60 +4896,60 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rdi, %r15 ; X64-NEXT: adcq %r8, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %r8 +; X64-NEXT: movq 16(%r13), %r9 ; X64-NEXT: movq %r12, %r10 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rdi, %r12 -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq 24(%r13), %rbp ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: adcq %r9, %r13 +; X64-NEXT: adcq %r8, %r13 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r13, %r9 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r13, %r8 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %r15, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r15, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: movq %r12, 
(%rsp) # 8-byte Spill -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq %r14, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx @@ -4949,13 +4957,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r9, %r11 +; X64-NEXT: addq %r8, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -4969,11 +4977,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 24(%r14), %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 24(%r14), %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi @@ -4987,7 +4995,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rdi, %rbx ; X64-NEXT: adcq %rsi, %r15 ; X64-NEXT: setb %sil -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi @@ -5021,37 +5029,37 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %r9, %r13 +; X64-NEXT: addq %r8, %r13 ; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r8 
; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: addq %rbx, %rax ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rbp -; X64-NEXT: setb %r9b +; X64-NEXT: adcq %r8, %rbp +; X64-NEXT: setb %r8b ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rbp, %rsi -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r13, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5063,12 +5071,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq %rcx, %r15 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r8, %rdi -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 @@ -5082,19 +5090,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %r12, %r11 ; X64-NEXT: adcq %rdi, %r13 ; X64-NEXT: setb %dil -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %rbx ; X64-NEXT: mulq %r14 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movzbl %r10b, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload @@ -5118,20 +5126,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 40(%r8), %rbx +; X64-NEXT: movq 40(%r8), %r10 ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r11, %rsi ; X64-NEXT: adcq %rdi, %r15 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: setb %r9b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r15, %r11 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax @@ -5147,32 +5155,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %r15, %rbp ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbx +; X64-NEXT: movq %r9, %rbx +; X64-NEXT: movq %r10, %rcx +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: addq 
%rbp, %rax ; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill ; X64-NEXT: adcq %r13, %r10 -; X64-NEXT: setb %bl +; X64-NEXT: setb %r9b ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r10, %rbp -; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r12, %rbp ; X64-NEXT: adcq %rsi, %r15 ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq 48(%r8), %rcx -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rbx, %r9 +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, %r12 ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 @@ -5182,27 +5191,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r13, %r9 -; X64-NEXT: adcq %r10, %r14 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r13, %r8 +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: addq %r9, %r13 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r15, %r9 +; X64-NEXT: adcq %r15, %r8 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r11, %r13 ; X64-NEXT: adcq %rdi, %rsi ; X64-NEXT: setb %r11b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rdi @@ -5210,195 +5219,195 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: setb %r10b +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r10, %rdi +; X64-NEXT: adcq %r9, %r14 +; X64-NEXT: setb %r9b ; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r8, %rdi -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %rbp ; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: adcq %rsi, %rdi ; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: adcq $0, %r15 +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %r15 +; X64-NEXT: adcq $0, %rbp ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r8, %rbx +; X64-NEXT: addq %r9, %rbx ; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: setb %r9b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: addq %r9, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r11, %r13 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rcx +; 
X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, %r11 -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r12, %r14 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r8, %r13 +; X64-NEXT: addq %r9, %r13 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: addq %r8, %r13 ; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r8, %r10 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r9, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %rbp, %r8 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, %rbx ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %r9, %rbp ; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %r10 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %r9, %r14 +; X64-NEXT: setb %r9b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: addq %r13, %r14 -; X64-NEXT: movq %r14, %r13 -; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: addq %r14, %rbx +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %r8 +; X64-NEXT: addq %r13, %rcx +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: movq %r10, %r13 ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq %rcx, %r9 -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: setb %r12b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r8, %rax -; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, 
%r9 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rdx ; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: adcq %r8, %r9 +; X64-NEXT: movzbl %r12b, %eax ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq %rdi, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbp, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq 64(%r13), %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq 64(%rbp), %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r15 @@ -5411,93 +5420,91 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 72(%r13), %rsi +; X64-NEXT: movq 72(%rbp), %rsi ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r8, %rbx -; X64-NEXT: adcq %rdi, %r10 -; X64-NEXT: setb %r8b +; X64-NEXT: adcq %rdi, %r9 +; X64-NEXT: setb %r10b ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r10, %r9 -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r15, %rcx ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: addq %r9, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r12, %rcx +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; 
X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r8, %rbp +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r9, %r12 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r11, %rbp +; X64-NEXT: addq %r11, %r12 ; X64-NEXT: adcq %rbx, %r15 -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 80(%r13), %r14 +; X64-NEXT: movq 80(%rbp), %r14 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r8, %r11 +; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 88(%r13), %rbx +; X64-NEXT: movq 88(%rbp), %rbx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r8, %r13 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r9, %rbp ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: addq %r12, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: addq %r9, %r13 -; X64-NEXT: adcq %rdi, %r12 -; X64-NEXT: setb %bpl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %r8, %rbp +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: setb %r10b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi @@ -5505,196 +5512,196 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rdi, %r9 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %r12 ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %r8b ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rcx, %r11 ; X64-NEXT: mulq %rbx ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r13, %rsi +; X64-NEXT: addq %rbp, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rbx -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r8 +; X64-NEXT: imulq %rdi, %rbx +; X64-NEXT: movq %rdi, %r10 ; X64-NEXT: addq %rbx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: imulq %rcx, %r14 ; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: imulq %rsi, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: imulq %rsi, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r10, %rdx +; X64-NEXT: addq %r9, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %r11, %rbx +; X64-NEXT: imulq %r15, %rbx ; X64-NEXT: addq %rdx, %rbx ; X64-NEXT: addq %r8, %rdi ; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %r10 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq %r9, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r8, %r9 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: addq %rdi, %r9 ; X64-NEXT: adcq %rbx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 112(%rcx), %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 120(%r8), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rax, %rdi +; X64-NEXT: movq 112(%r8), %rbx ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: imulq %r11, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq 120(%rcx), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %r12 -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: movq 96(%rcx), %r13 -; X64-NEXT: movq 104(%rcx), %r8 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %rbx -; X64-NEXT: imulq 
%r8, %rbx +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: imulq %rcx, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq 96(%r8), %r13 +; X64-NEXT: movq 104(%r8), %r8 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, %r14 +; X64-NEXT: imulq %r8, %r14 ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbx, %rdx -; X64-NEXT: imulq %r13, %r9 -; X64-NEXT: addq %rdx, %r9 -; X64-NEXT: addq %rbp, %rdi -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: movq %r9, %r15 +; X64-NEXT: addq %r14, %rdx +; X64-NEXT: imulq %r13, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r10, %rdi +; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: movq %r11, %r12 ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r10, %r14 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %r13 +; X64-NEXT: addq %r14, %r13 ; X64-NEXT: adcq %rbp, %r10 ; X64-NEXT: setb %bl ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %rcx ; X64-NEXT: addq %r10, %rax ; X64-NEXT: movzbl %bl, %r8d ; X64-NEXT: adcq %r8, %rdx ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rax +; X64-NEXT: adcq %r12, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq %r15, %r13 +; X64-NEXT: adcq %r9, %rax ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq 80(%r14), %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: mulq %rbx +; X64-NEXT: movq 80(%rbx), %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%r14), %r15 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbx +; X64-NEXT: movq 88(%rbx), %r14 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r9 ; 
X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r9, %rdi ; X64-NEXT: adcq %r8, %rcx ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: movq 64(%r14), %rcx +; X64-NEXT: movq 64(%rbx), %rcx ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq 72(%r14), %r8 +; X64-NEXT: movq 72(%rbx), %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %r11, %r14 ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rbx, %r11 ; X64-NEXT: setb %cl ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r11, %rbp @@ -5710,9 +5717,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, %r9 ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi @@ -5727,15 +5734,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: adcq %rdi, %r13 ; X64-NEXT: setb %cl -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r13, %rdi ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rbp, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rbx, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdi @@ -5748,8 +5755,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx @@ -5763,7 +5770,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: adcq %r11, %r13 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, %r11 @@ -5793,97 +5800,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq 112(%rcx), %rax ; X64-NEXT: movq %rcx, 
%r14 ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: mulq %rbx +; X64-NEXT: imulq %rbx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: movq 120(%r14), %r13 -; X64-NEXT: imulq %rbx, %r13 +; X64-NEXT: imulq %r10, %r13 ; X64-NEXT: addq %rdx, %r13 ; X64-NEXT: addq %rdi, %r8 ; X64-NEXT: adcq %r11, %r13 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r14 ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rbx ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbx, %r12 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp ; X64-NEXT: adcq %r11, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %r8, %r9 +; X64-NEXT: addq %r8, %r10 ; X64-NEXT: adcq %r13, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %rdi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rdi, %rax +; X64-NEXT: addq %rax, %rdx ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: imulq %r12, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: imulq %r15, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: imulq %rdi, %rbp -; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %r9, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq %r13, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: adcq %r13, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq 
%rax, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rsi, %rcx ; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %sil ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: mulq %r12 ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq %r12, %rcx -; X64-NEXT: adcq %r9, %rax +; X64-NEXT: adcq %rbp, %rcx +; X64-NEXT: adcq %r10, %rax ; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 2d7737bfdd3c2..7617fa8da7f2a 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -12,20 +12,20 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $72, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl 8(%eax), %ebp -; X86-NEXT: movl (%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 8(%edi), %ebp +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 12(%edi), %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %esi, %edi @@ -49,127 +49,125 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esi), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 4(%esi), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl 4(%esi), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, 
%ebx +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 8(%eax), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 12(%eax), %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 12(%ebp), %ebp +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl 16(%ecx), %esi -; X86-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NEXT: imull %esi, %edi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl 20(%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edi ; X86-NEXT: imull %eax, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: movl 24(%ecx), %eax @@ -183,8 +181,8 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl 28(%ecx), %ecx ; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edi, %ecx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %esi @@ -209,83 +207,81 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %bl, %esi ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 24(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 28(%ecx), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl 24(%ecx), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edx ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 28(%edi), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl 16(%edi), %edi -; X86-NEXT: movl 20(%edx), %ebp +; X86-NEXT: movl 16(%ecx), %ebx +; X86-NEXT: movl 20(%ecx), %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: imull %ebp, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: imull %ebp, %edi ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, (%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 4(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 8(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 12(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, (%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 4(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 12(%esi) ; X86-NEXT: movl %ecx, 16(%esi) -; X86-NEXT: movl %edi, 20(%esi) +; X86-NEXT: movl %ebx, 20(%esi) ; X86-NEXT: movl %eax, 24(%esi) ; X86-NEXT: movl %edx, 28(%esi) ; X86-NEXT: addl $72, %esp @@ -312,9 +308,9 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r15, %rdx ; X64-NEXT: imulq %r14, %r10 ; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %r15, %r10 ; X64-NEXT: movq %r8, %r15 ; X64-NEXT: imulq %r11, %r15 ; X64-NEXT: movq %r8, %rax diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index 2421aabdbcd99..b87e6c4e32156 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -9,38 +9,38 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $180, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 28(%eax), %ebx -; X86-NEXT: movl 24(%eax), %ebp -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $184, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 24(%esi), %ebx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl 28(%esi), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: 
addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx @@ -53,27 +53,26 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 20(%ecx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ecx), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -81,40 +80,39 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 8(%ebx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 12(%eax), %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl 12(%ebx), %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -129,123 +127,123 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 8(%ecx), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ecx), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl 12(%ecx), %edi +; X86-NEXT: movl %edi, %eax +; 
X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl 4(%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; 
X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -256,8 +254,8 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi @@ -265,30 +263,29 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -305,63 +302,62 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 16(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 20(%eax), %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 20(%eax), %edi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -375,27 +371,27 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 28(%eax), %ecx +; X86-NEXT: movl 28(%eax), %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -406,62 +402,63 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi @@ -470,85 +467,84 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; 
X86-NEXT: mull %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl %ebp, %esi -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl 
%edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx @@ -745,37 +741,40 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: imull %ebp, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: imull %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: imull %ecx, %esi +; X86-NEXT: imull %edi, %esi ; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi @@ -789,28 +788,28 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 56(%edi), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 60(%ebx), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %eax, %esi +; X86-NEXT: movl 56(%ebx), %ecx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %edx ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movl 48(%edi), %esi -; X86-NEXT: movl 52(%edi), %edi +; X86-NEXT: movl 48(%ebx), %esi +; X86-NEXT: movl 52(%ebx), %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: imull %edi, %ebx @@ -836,15 +835,15 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx @@ -852,13 +851,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -889,6 +888,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax @@ -902,67 +902,67 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 36(%esi), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl 36(%esi), 
%eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 
1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax @@ -973,44 +973,46 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: mull %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl 48(%esi), %edi -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 48(%ecx), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl 52(%esi), %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull %edi, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl 52(%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl 56(%esi), %eax +; X86-NEXT: movl 56(%ecx), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: imull %ebp, %esi @@ -1026,92 +1028,94 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: addl %edx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: imull %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ebx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull %ebp ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -1121,19 +1125,19 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1159,12 +1163,12 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X86-NEXT: movl %esi, 36(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, 40(%ecx) -; X86-NEXT: movl %ebx, 
44(%ecx) +; X86-NEXT: movl %edi, 44(%ecx) ; X86-NEXT: movl %ebp, 48(%ecx) -; X86-NEXT: movl %edi, 52(%ecx) +; X86-NEXT: movl %ebx, 52(%ecx) ; X86-NEXT: movl %eax, 56(%ecx) ; X86-NEXT: movl %edx, 60(%ecx) -; X86-NEXT: addl $180, %esp +; X86-NEXT: addl $184, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1181,254 +1185,251 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rdi), %rbx -; X64-NEXT: movq 8(%rdi), %rdi -; X64-NEXT: movq 24(%rax), %r14 -; X64-NEXT: movq 16(%rax), %rax -; X64-NEXT: movq (%rsi), %r8 -; X64-NEXT: movq 8(%rsi), %r11 -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 24(%rdi), %r10 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq (%rsi), %r11 +; X64-NEXT: movq 8(%rsi), %rbp +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 -; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: addq %r9, %r15 +; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: movq 8(%rdi), %rdi ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq %rsi, %r13 ; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %rcx -; X64-NEXT: setb %sil ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: 
mulq %rbp +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %r9 +; X64-NEXT: setb %sil +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq %r15, %r14 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r12, %rbp ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq 16(%r12), %rsi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, %r12 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq 24(%rsi), %rsi -; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r9, %r15 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq 24(%rbp), %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %rbx, %r9 +; X64-NEXT: setb %bpl ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: movzbl %bpl, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r14, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: setb %dil -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq %r13, %r15 +; X64-NEXT: setb %bpl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi 
+; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: addq %rbx, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %r15, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %eax ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 32(%r8), %r15 -; X64-NEXT: imulq %r15, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq 40(%r8), %rsi -; X64-NEXT: imulq %rsi, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq 48(%r8), %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq 32(%r10), %rbx +; X64-NEXT: imulq %rbx, %rdi +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq 40(%r10), %r8 +; X64-NEXT: imulq %r8, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq 48(%r10), %rax ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: imulq %r9, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%r8), %r8 -; X64-NEXT: imulq %r11, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %r10, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq 56(%r10), %r11 +; X64-NEXT: imulq %r14, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %r10 +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r15, %r13 -; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: adcq %r9, %r15 +; X64-NEXT: setb %sil +; X64-NEXT: movq %r10, %rax +; 
X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: addq %r15, %r10 +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %r8, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 48(%r8), %rsi +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq %r11, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq 56(%rdi), %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: imulq %rax, %rsi +; X64-NEXT: movq 48(%rdi), %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %r8, %rdx -; X64-NEXT: movq 56(%r8), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %r8 -; X64-NEXT: addq %rax, %rsi -; X64-NEXT: movq 32(%rdx), %rbp -; X64-NEXT: movq 40(%rdx), %r9 +; X64-NEXT: imulq %r14, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq 32(%rdi), %r15 +; X64-NEXT: movq 40(%rdi), %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: imulq %rbx, %rsi +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: imulq %r9, %rdi -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: imulq %rbp, %r11 +; X64-NEXT: imulq %r15, %r11 ; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r9, %rdi +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq %rbp, %r9 +; X64-NEXT: setb %cl +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: adcq %r11, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %r13, %r8 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %r15 ; X64-NEXT: adcq %r10, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte 
Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload @@ -1436,12 +1437,12 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rdi, (%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, 8(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 16(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 24(%rcx) -; X64-NEXT: movq %rsi, 32(%rcx) -; X64-NEXT: movq %r8, 40(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 16(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq %rsi, 24(%rcx) +; X64-NEXT: movq %r8, 32(%rcx) +; X64-NEXT: movq %r15, 40(%rcx) ; X64-NEXT: movq %rax, 48(%rcx) ; X64-NEXT: movq %rdx, 56(%rcx) ; X64-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index fc1cc1f65627a..721d7374d8059 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -7,8 +7,8 @@ define i128 @foo(i128 %t, i128 %u) { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: imulq %rdi, %rcx ; X64-NEXT: mulq %rdx +; X64-NEXT: imulq %rdi, %rcx ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: imulq %rsi, %r8 ; X64-NEXT: addq %r8, %rdx @@ -30,38 +30,36 @@ define i128 @foo(i128 %t, i128 %u) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %esi, %eax -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: addl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: imull %ebp, %esi ; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %ebx, %ecx diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 
e101c702e6409..56ce708a666b7 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -15,39 +15,40 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rsi, %r8 +; CHECK-NEXT: movq %rdi, %r10 ; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: sarq $63, %rbx ; CHECK-NEXT: imulq %rdx, %rbx ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: movq %rdx, %r10 +; CHECK-NEXT: movq %rdx, %r11 ; CHECK-NEXT: movq %rax, %rsi ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: mulq %r9 -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: movq %rax, %r11 -; CHECK-NEXT: addq %r10, %r11 -; CHECK-NEXT: adcq %rbx, %r9 -; CHECK-NEXT: movq %r9, %rbx -; CHECK-NEXT: sarq $63, %rbx +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: movq %rax, %r9 +; CHECK-NEXT: addq %r11, %r9 +; CHECK-NEXT: adcq %rbx, %rdi +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movq %rcx, %r14 ; CHECK-NEXT: sarq $63, %r14 -; CHECK-NEXT: imulq %rdi, %r14 -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: imulq %r10, %r14 +; CHECK-NEXT: movq %r10, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: movq %rdx, %r10 -; CHECK-NEXT: movq %rax, %rdi -; CHECK-NEXT: addq %r11, %rdi -; CHECK-NEXT: adcq %r14, %r10 -; CHECK-NEXT: movq %r10, %r11 -; CHECK-NEXT: sarq $63, %r11 +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: sarq $63, %rbx ; CHECK-NEXT: addq %r9, %r10 -; CHECK-NEXT: adcq %rbx, %r11 +; CHECK-NEXT: adcq %r14, %r11 +; CHECK-NEXT: movq %r11, %r9 +; CHECK-NEXT: sarq $63, %r9 +; CHECK-NEXT: addq %rdi, %r11 +; CHECK-NEXT: adcq %rbx, %r9 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: imulq %rcx -; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: addq %r11, %rax +; CHECK-NEXT: adcq %r9, %rdx +; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: xorq %rax, %rcx @@ -55,7 +56,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx +; CHECK-NEXT: movq %r10, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll index 961205c50d976..38ab08225ea9d 100644 --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -104,36 +104,34 @@ define i64 @neg_abs_i64(i64 %x) nounwind { define i128 @neg_abs_i128(i128 %x) nounwind { ; X86-LABEL: neg_abs_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: subl %ebx, %ebp ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: subl %edi, %ebx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebp, (%eax) 
-; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: neg_abs_i128: diff --git a/llvm/test/CodeGen/X86/neg-shl-add.ll b/llvm/test/CodeGen/X86/neg-shl-add.ll index 9a4bde0743254..c305813f31a9e 100644 --- a/llvm/test/CodeGen/X86/neg-shl-add.ll +++ b/llvm/test/CodeGen/X86/neg-shl-add.ll @@ -7,9 +7,9 @@ define i64 @foo(i64 %x, i64 %y, i64 %n) nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdx, %rcx -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, %rsi +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: subq %rsi, %rax ; CHECK-NEXT: retq %a = sub i64 0, %y @@ -21,9 +21,9 @@ define i64 @boo(i64 %x, i64 %y, i64 %n) nounwind { ; CHECK-LABEL: boo: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdx, %rcx -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shlq %cl, %rsi +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: subq %rsi, %rax ; CHECK-NEXT: retq %a = sub i64 0, %y diff --git a/llvm/test/CodeGen/X86/no-split-size.ll b/llvm/test/CodeGen/X86/no-split-size.ll index c1f93acd77dee..06b4f772953b8 100644 --- a/llvm/test/CodeGen/X86/no-split-size.ll +++ b/llvm/test/CodeGen/X86/no-split-size.ll @@ -30,8 +30,8 @@ define i64 @foo(ptr %ptr, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6) optsize { ; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.else -; CHECK-NEXT: testq %r13, %r13 ; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: testq %r13, %r13 ; CHECK-NEXT: je .LBB0_3 ; CHECK-NEXT: .LBB0_4: # %if.end ; CHECK-NEXT: addq %r13, %rax diff --git a/llvm/test/CodeGen/X86/nontemporal-4.ll b/llvm/test/CodeGen/X86/nontemporal-4.ll index 3d86174e45103..c1eff891a9487 100644 --- a/llvm/test/CodeGen/X86/nontemporal-4.ll +++ b/llvm/test/CodeGen/X86/nontemporal-4.ll @@ -659,9 +659,9 @@ define void @test_constant_v4i64_align16(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v4i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613] ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,18446744073709551615] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <4 x i64> , ptr %dst, align 16, !nontemporal !1 @@ -687,9 +687,9 @@ define void @test_constant_v8i32_align16(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v8i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289] ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293] ; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <8 x i32> , ptr %dst, align 16, !nontemporal !1 @@ -1408,13 +1408,13 @@ define void @test_constant_v8i64_align16(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v8i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = 
[18446744073709551614,18446744073709551613] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551614,18446744073709551613] ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,18446744073709551615] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551610,18446744073709551609] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551610,18446744073709551609] ; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551612,18446744073709551611] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551612,18446744073709551611] ; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> , ptr %dst, align 16, !nontemporal !1 @@ -1448,13 +1448,13 @@ define void @test_constant_v16i32_align16(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v16i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967292,4294967291,4294967290,4294967289] ; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,4294967294,4294967293] ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967284,4294967283,4294967282,4294967281] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967284,4294967283,4294967282,4294967281] ; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967288,4294967287,4294967286,4294967285] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4294967288,4294967287,4294967286,4294967285] ; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <16 x i32> , ptr %dst, align 16, !nontemporal !1 @@ -1634,9 +1634,9 @@ define void @test_constant_v8i64_align32(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v8i64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551612,18446744073709551611,18446744073709551610,18446744073709551609] +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551612,18446744073709551611,18446744073709551610,18446744073709551609] ; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551614,18446744073709551613] +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551614,18446744073709551613] ; AVX512-NEXT: vmovntps %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1668,9 +1668,9 @@ define void @test_constant_v16i32_align32(ptr %dst) nounwind { ; ; AVX512-LABEL: test_constant_v16i32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967288,4294967287,4294967286,4294967285,4294967284,4294967283,4294967282,4294967281] +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4294967288,4294967287,4294967286,4294967285,4294967284,4294967283,4294967282,4294967281] ; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4294967295,4294967294,4294967293,4294967292,4294967291,4294967290,4294967289] +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,4294967294,4294967293,4294967292,4294967291,4294967290,4294967289] ; AVX512-NEXT: vmovntps %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll index 
98d193a79cb74..8f73603746639 100644 --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1787,21 +1787,21 @@ define <16 x i32> @test_masked_v16i32(ptr %addr, <16 x i32> %old, <16 x i32> %ma ; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: movntdqa 48(%rdi), %xmm4 -; SSE41-NEXT: movntdqa 32(%rdi), %xmm9 -; SSE41-NEXT: movntdqa 16(%rdi), %xmm10 -; SSE41-NEXT: movntdqa (%rdi), %xmm11 -; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: movntdqa (%rdi), %xmm4 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm10 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm5 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: movaps %xmm11, %xmm0 -; SSE41-NEXT: movaps %xmm10, %xmm1 -; SSE41-NEXT: movaps %xmm9, %xmm2 -; SSE41-NEXT: movaps %xmm4, %xmm3 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm8, %xmm1 +; SSE41-NEXT: movaps %xmm5, %xmm2 +; SSE41-NEXT: movaps %xmm6, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_masked_v16i32: @@ -1814,11 +1814,11 @@ define <16 x i32> @test_masked_v16i32(ptr %addr, <16 x i32> %old, <16 x i32> %ma ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm5 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-NEXT: vblendvps %ymm3, %ymm1, %ymm5, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 ; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 3b6ffacb0b230..2391ec42b22f8 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -12,36 +12,36 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; X86-SSE-NEXT: movl 12(%ebp), %ecx -; X86-SSE-NEXT: movdqa 56(%ebp), %xmm4 -; X86-SSE-NEXT: movdqa 40(%ebp), %xmm5 -; X86-SSE-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE-NEXT: movl 8(%ebp), %esi -; X86-SSE-NEXT: movl 80(%ebp), %edx -; X86-SSE-NEXT: movl (%edx), %eax +; X86-SSE-NEXT: movdqa 56(%ebp), %xmm3 +; X86-SSE-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE-NEXT: movl 8(%ebp), %edx +; X86-SSE-NEXT: movl 80(%ebp), %ecx +; X86-SSE-NEXT: movl (%ecx), %eax ; X86-SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movntps %xmm0, (%esi) ; X86-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm2, (%esi) +; X86-SSE-NEXT: movntps %xmm0, (%edx) +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movntdq %xmm2, (%edx) ; X86-SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: addl (%edx), 
%eax -; X86-SSE-NEXT: movntpd %xmm1, (%esi) -; X86-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm6, (%esi) -; X86-SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm5, (%esi) -; X86-SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntdq %xmm4, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movntil %ecx, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax -; X86-SSE-NEXT: movsd %xmm3, (%esi) -; X86-SSE-NEXT: addl (%edx), %eax +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE-NEXT: movntpd %xmm1, (%edx) +; X86-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 +; X86-SSE-NEXT: movntdq %xmm0, (%edx) +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movntdq %xmm4, (%edx) +; X86-SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movntdq %xmm3, (%edx) +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movl 12(%ebp), %esi +; X86-SSE-NEXT: movntil %esi, (%edx) +; X86-SSE-NEXT: addl (%ecx), %eax +; X86-SSE-NEXT: movsd %xmm0, (%edx) +; X86-SSE-NEXT: addl (%ecx), %eax ; X86-SSE-NEXT: leal -4(%ebp), %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %ebp @@ -54,36 +54,36 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: andl $-16, %esp ; X86-AVX-NEXT: subl $16, %esp -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X86-AVX-NEXT: movl 12(%ebp), %ecx -; X86-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 -; X86-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 -; X86-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X86-AVX-NEXT: movl 8(%ebp), %esi -; X86-AVX-NEXT: movl 80(%ebp), %edx -; X86-AVX-NEXT: movl (%edx), %eax +; X86-AVX-NEXT: vmovdqa 56(%ebp), %xmm3 +; X86-AVX-NEXT: vmovdqa 40(%ebp), %xmm4 +; X86-AVX-NEXT: movl 8(%ebp), %edx +; X86-AVX-NEXT: movl 80(%ebp), %ecx +; X86-AVX-NEXT: movl (%ecx), %eax ; X86-AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovntps %xmm0, (%esi) -; X86-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) +; X86-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2 +; X86-AVX-NEXT: vmovntps %xmm0, (%edx) +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: vmovntdq %xmm2, (%edx) ; X86-AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntpd %xmm0, (%esi) -; X86-AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) -; X86-AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) -; X86-AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm0 -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: movntil %ecx, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax -; X86-AVX-NEXT: vmovsd %xmm3, (%esi) -; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: vmovdqa 24(%ebp), %xmm1 +; X86-AVX-NEXT: vmovntpd %xmm0, (%edx) +; X86-AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: 
vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm1 +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: vmovntdq %xmm1, (%edx) +; X86-AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm0 +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: movl 12(%ebp), %esi +; X86-AVX-NEXT: movntil %esi, (%edx) +; X86-AVX-NEXT: addl (%ecx), %eax +; X86-AVX-NEXT: vmovsd %xmm1, (%edx) +; X86-AVX-NEXT: addl (%ecx), %eax ; X86-AVX-NEXT: leal -4(%ebp), %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %ebp @@ -93,8 +93,8 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movl (%rcx), %eax ; X64-SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: movntps %xmm0, (%rdi) ; X64-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-SSE-NEXT: movntps %xmm0, (%rdi) ; X64-SSE-NEXT: addl (%rcx), %eax ; X64-SSE-NEXT: movntdq %xmm2, (%rdi) ; X64-SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -102,8 +102,8 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X64-SSE-NEXT: movntpd %xmm1, (%rdi) ; X64-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-SSE-NEXT: addl (%rcx), %eax -; X64-SSE-NEXT: movntdq %xmm3, (%rdi) ; X64-SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; X64-SSE-NEXT: movntdq %xmm3, (%rdi) ; X64-SSE-NEXT: addl (%rcx), %eax ; X64-SSE-NEXT: movntdq %xmm4, (%rdi) ; X64-SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 @@ -120,19 +120,19 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movl (%rcx), %eax ; X64-AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; X64-AVX-NEXT: vmovntps %xmm0, (%rdi) -; X64-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 ; X64-AVX-NEXT: addl (%rcx), %eax -; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) +; X64-AVX-NEXT: vmovntdq %xmm2, (%rdi) ; X64-AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; X64-AVX-NEXT: addl (%rcx), %eax ; X64-AVX-NEXT: vmovntpd %xmm0, (%rdi) ; X64-AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 ; X64-AVX-NEXT: addl (%rcx), %eax +; X64-AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm1 ; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) -; X64-AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm0 ; X64-AVX-NEXT: addl (%rcx), %eax -; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) +; X64-AVX-NEXT: vmovntdq %xmm1, (%rdi) ; X64-AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm0 ; X64-AVX-NEXT: addl (%rcx), %eax ; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/nosse-vector.ll b/llvm/test/CodeGen/X86/nosse-vector.ll index 9807d1b09d8ef..4cd3cbaff0ade 100644 --- a/llvm/test/CodeGen/X86/nosse-vector.ll +++ b/llvm/test/CodeGen/X86/nosse-vector.ll @@ -267,20 +267,20 @@ define void @add_2i64_mem(ptr %p0, ptr %p1, ptr %p2) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl 12(%edx), %esi -; X32-NEXT: movl 8(%edx), %edi -; X32-NEXT: movl (%edx), %ebx -; X32-NEXT: movl 4(%edx), %edx -; X32-NEXT: addl (%ecx), %ebx -; X32-NEXT: adcl 4(%ecx), %edx -; X32-NEXT: addl 8(%ecx), %edi -; X32-NEXT: adcl 
12(%ecx), %esi -; X32-NEXT: movl %edi, 8(%eax) -; X32-NEXT: movl %ebx, (%eax) -; X32-NEXT: movl %esi, 12(%eax) -; X32-NEXT: movl %edx, 4(%eax) +; X32-NEXT: movl (%edx), %ecx +; X32-NEXT: addl (%eax), %ecx +; X32-NEXT: movl 4(%edx), %esi +; X32-NEXT: adcl 4(%eax), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 8(%edx), %ebx +; X32-NEXT: addl 8(%eax), %ebx +; X32-NEXT: movl 12(%edx), %edx +; X32-NEXT: adcl 12(%eax), %edx +; X32-NEXT: movl %ebx, 8(%edi) +; X32-NEXT: movl %ecx, (%edi) +; X32-NEXT: movl %edx, 12(%edi) +; X32-NEXT: movl %esi, 4(%edi) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index edc8404993996..0fabc3912174d 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -25,9 +25,9 @@ define void @v3i64(<2 x i64> %a, <2 x i64> %b, ptr %p) nounwind { ; ; AVX-LABEL: v3i64: ; AVX: # %bb.0: -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX-NEXT: vpextrq $1, %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqa %xmm1, (%rdi) +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rdi) ; AVX-NEXT: retq %r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> store <3 x i64> %r, ptr %p @@ -70,9 +70,9 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, ptr %p) nounwind { ; ; AVX-LABEL: v3i32: ; AVX: # %bb.0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) -; AVX-NEXT: vmovlps %xmm1, (%rdi) +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovlps %xmm0, (%rdi) ; AVX-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> store <3 x i32> %r, ptr %p @@ -82,8 +82,8 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, ptr %p) nounwind { define void @v5i16(<4 x i16> %a, <4 x i16> %b, ptr %p) nounwind { ; SSE2-LABEL: v5i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $16, %xmm1 ; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: psrlq $16, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movw %ax, 8(%rdi) ; SSE2-NEXT: movq %xmm0, (%rdi) @@ -91,8 +91,8 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, ptr %p) nounwind { ; ; SSE42-LABEL: v5i16: ; SSE42: # %bb.0: -; SSE42-NEXT: psrlq $16, %xmm1 ; SSE42-NEXT: pextrw $3, %xmm0, 8(%rdi) +; SSE42-NEXT: psrlq $16, %xmm1 ; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE42-NEXT: movq %xmm0, (%rdi) ; SSE42-NEXT: retq @@ -121,18 +121,18 @@ define void @v5i32(<4 x i32> %a, <4 x i32> %b, ptr %p) nounwind { ; ; SSE42-LABEL: v5i32: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] ; SSE42-NEXT: pextrd $3, %xmm0, 16(%rdi) +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] ; SSE42-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE42-NEXT: movdqa %xmm0, (%rdi) ; SSE42-NEXT: retq ; ; AVX-LABEL: v5i32: ; AVX: # %bb.0: -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3] -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) -; AVX-NEXT: vmovaps %xmm1, (%rdi) +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovaps %xmm0, (%rdi) ; AVX-NEXT: retq %r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> store <5 x i32> %r, ptr %p @@ -246,19 +246,20 @@ define void @v7i16(<4 
x i16> %a, <4 x i16> %b, ptr %p) nounwind { ; ; SSE42-LABEL: v7i16: ; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm1, %xmm2 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] +; SSE42-NEXT: pextrd $2, %xmm2, 8(%rdi) ; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi) -; SSE42-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] -; SSE42-NEXT: pextrd $2, %xmm1, 8(%rdi) -; SSE42-NEXT: movq %xmm1, (%rdi) +; SSE42-NEXT: movq %xmm2, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: v7i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] -; AVX1-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) +; AVX1-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: retq ; @@ -266,16 +267,16 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, ptr %p) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,8,9,14,15,8,9,6,7,12,13,0,1,14,15] -; AVX2-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) +; AVX2-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: retq ; ; XOP-LABEL: v7i16: ; XOP: # %bb.0: ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[4,5],xmm0[6,7],xmm1[4,5],xmm0[2,3],xmm1[6,7,0,1],xmm0[6,7] -; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi) +; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi) ; XOP-NEXT: vmovq %xmm0, (%rdi) ; XOP-NEXT: retq %r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> @@ -412,8 +413,8 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, ptr %p) nounwind { ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; SSE42-NEXT: movdqa %xmm0, (%rdi) ; SSE42-NEXT: movq %xmm3, 16(%rdi) @@ -425,8 +426,8 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, ptr %p) nounwind { ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) @@ -438,8 +439,8 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, ptr %p) nounwind { ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi) ; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi) @@ -447,11 +448,11 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, ptr %p) nounwind { ; ; AVX2-FAST-LABEL: v12i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi) ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi) @@ -546,7 +547,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind { ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = [u,3,7,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] @@ -717,24 +718,24 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; ; SSE42-LABEL: interleave_24i8_out: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rdi), %xmm0 -; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[2,5,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movdqu (%rdi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm3 -; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movdqa %xmm2, %xmm4 ; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm4 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: por %xmm1, %xmm4 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[1,4,7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,5,8,11,14],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm2 ; SSE42-NEXT: movq %xmm3, (%rsi) ; SSE42-NEXT: 
movq %xmm4, (%rdx) -; SSE42-NEXT: movq %xmm0, (%rcx) +; SSE42-NEXT: movq %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i8_out: @@ -743,10 +744,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -761,10 +762,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -840,17 +841,17 @@ define void @interleave_24i8_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42: # %bb.0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] ; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[13],zero,xmm2[6,14],zero,xmm2[7,15],zero,xmm2[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm2, %xmm0 -; SSE42-NEXT: movq %xmm0, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} 
xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm3 +; SSE42-NEXT: por %xmm1, %xmm2 +; SSE42-NEXT: movq %xmm2, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -862,9 +863,9 @@ define void @interleave_24i8_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm2, (%rdi) @@ -878,9 +879,9 @@ define void @interleave_24i8_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, 16(%rdi) ; AVX2-NEXT: vmovdqu %xmm2, (%rdi) @@ -968,22 +969,22 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE42: # %bb.0: ; SSE42-NEXT: movdqu (%rdi), %xmm0 ; SSE42-NEXT: movdqu 16(%rdi), %xmm1 -; SSE42-NEXT: movdqu 32(%rdi), %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; SSE42-NEXT: movdqu 32(%rdi), %xmm3 ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,2,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] +; SSE42-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE42-NEXT: pblendw 
{{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; SSE42-NEXT: movdqu %xmm4, (%rsi) -; SSE42-NEXT: movdqu %xmm3, (%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; SSE42-NEXT: movdqu %xmm2, (%rsi) +; SSE42-NEXT: movdqu %xmm4, (%rdx) ; SSE42-NEXT: movdqu %xmm1, (%rcx) ; SSE42-NEXT: retq ; @@ -992,18 +993,18 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX1-NEXT: vmovdqu %xmm3, (%rsi) +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vmovdqu %xmm1, (%rsi) ; AVX1-NEXT: vmovdqu %xmm4, (%rdx) ; AVX1-NEXT: vmovdqu %xmm0, (%rcx) ; AVX1-NEXT: retq @@ -1111,25 +1112,25 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou ; ; SSE42-LABEL: interleave_24i16_out_reverse: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rdi), %xmm0 -; SSE42-NEXT: movdqu 16(%rdi), %xmm1 -; SSE42-NEXT: movdqu 32(%rdi), %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; SSE42-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; SSE42-NEXT: movdqu 16(%rdi), %xmm0 +; SSE42-NEXT: movdqu 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[14,15,8,9,2,3,12,13,6,7,0,1,u,u,u,u] +; SSE42-NEXT: movdqu (%rdi), %xmm3 ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[14,15,8,9,2,3,12,13,6,7,0,1,u,u,u,u] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; SSE42-NEXT: 
pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1] -; SSE42-NEXT: movdqu %xmm4, (%rsi) -; SSE42-NEXT: movdqu %xmm3, (%rdx) -; SSE42-NEXT: movdqu %xmm1, (%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3] +; SSE42-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; SSE42-NEXT: movdqu %xmm2, (%rsi) +; SSE42-NEXT: movdqu %xmm4, (%rdx) +; SSE42-NEXT: movdqu %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i16_out_reverse: @@ -1299,9 +1300,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -1318,12 +1319,12 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] ; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -1345,9 +1346,9 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -1362,17 +1363,17 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; XOP: # %bb.0: ; XOP-NEXT: vmovdqu (%rsi), %xmm0 ; XOP-NEXT: vmovdqu (%rdx), %xmm1 -; XOP-NEXT: vmovdqu (%rcx), %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] -; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] +; XOP-NEXT: vmovdqu (%rcx), %xmm3 +; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; XOP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] ; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11] -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[0,1],xmm4[4,5,6,7],xmm3[2,3],xmm4[8,9,10,11] +; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm3[10,11],xmm0[10,11,8,9],xmm3[12,13],xmm0[14,15,12,13],xmm3[14,15] ; XOP-NEXT: vmovdqu %xmm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, (%rdi) +; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i16>, ptr %q1, align 4 @@ -1745,10 +1746,9 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = [5,u,u,6,u,u,7,u] ; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] @@ -1964,8 +1964,7 @@ define void 
@splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) { ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 @@ -2136,8 +2135,7 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) { ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 @@ -2370,9 +2368,9 @@ define void @D107009(ptr %input, ptr %output) { ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa 144(%rdi), %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] @@ -2434,16 +2432,16 @@ define void @D107009(ptr %input, ptr %output) { ; AVX2-LABEL: D107009: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 128(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 192(%rdi), %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX2-NEXT: vmovdqu 192(%rdi), %ymm1 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-NEXT: vmovdqu 128(%rdi), %ymm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index a1da40e7e7655..05b1d94f50b2e 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -209,8 +209,8 @@ define void @PR42833() { ; SSE42-NEXT: movdqa c+176(%rip), %xmm2 ; SSE42-NEXT: movdqa d+160(%rip), %xmm4 ; SSE42-NEXT: movdqa d+176(%rip), %xmm5 -; SSE42-NEXT: movdqa d+128(%rip), %xmm6 ; SSE42-NEXT: pinsrd $0, %eax, %xmm0 +; SSE42-NEXT: movdqa d+128(%rip), %xmm6 ; SSE42-NEXT: psubd %xmm0, %xmm6 ; SSE42-NEXT: psubd %xmm2, %xmm5 ; SSE42-NEXT: psubd %xmm1, %xmm4 @@ -235,24 +235,24 @@ define void @PR42833() { ; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmovdqa d+144(%rip), %xmm3 +; AVX1-NEXT: vpsubd c+144(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 -; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vmovups %ymm0, c+128(%rip) ; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 -; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa c+176(%rip), %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4 ; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5 ; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip) +; AVX1-NEXT: vmovdqa %xmm3, d+144(%rip) ; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip) ; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip) ; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip) -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm2, %xmm0 ; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip) ; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip) @@ -317,24 +317,24 @@ define void @PR42833() { ; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; XOP-NEXT: vmovdqa d+144(%rip), %xmm3 +; XOP-NEXT: vpsubd c+144(%rip), %xmm3, %xmm3 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] -; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 -; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; XOP-NEXT: vmovups %ymm0, c+128(%rip) ; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 -; XOP-NEXT: vmovdqa c+176(%rip), %xmm3 -; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vmovdqa c+176(%rip), %xmm2 +; XOP-NEXT: vpsubd %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa d+160(%rip), %xmm4 ; XOP-NEXT: vmovdqa c+160(%rip), %xmm5 ; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4 -; XOP-NEXT: vmovdqa %xmm2, d+144(%rip) +; XOP-NEXT: vmovdqa %xmm3, d+144(%rip) ; XOP-NEXT: vmovdqa %xmm4, d+160(%rip) ; XOP-NEXT: vmovdqa %xmm1, d+176(%rip) ; XOP-NEXT: vmovdqa %xmm0, d+128(%rip) -; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0 +; XOP-NEXT: vpaddd %xmm2, %xmm2, %xmm0 ; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1 ; XOP-NEXT: vmovdqa %xmm1, c+160(%rip) ; XOP-NEXT: vmovdqa %xmm0, c+176(%rip) diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index 9e398096bfcc5..489f2096aeecb 100644 --- 
a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -112,11 +112,11 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) { ; AVX2-LABEL: p4_vector_urem_by_const__splat: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -149,7 +149,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = [1,2147483648] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2147483648,u] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; SSE4-NEXT: psrlq $32, %xmm1 @@ -212,11 +212,11 @@ define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32 ; AVX2-LABEL: p6_vector_urem_by_const__nonsplat_undef0: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/optimize-max-2.ll b/llvm/test/CodeGen/X86/optimize-max-2.ll index 3533bfbc61e6d..7e5dc48cb333a 100644 --- a/llvm/test/CodeGen/X86/optimize-max-2.ll +++ b/llvm/test/CodeGen/X86/optimize-max-2.ll @@ -14,14 +14,15 @@ define void @foo(ptr nocapture %p, i64 %x, i64 %y) nounwind { ; CHECK-NEXT: cmovneq %rdx, %rax ; CHECK-NEXT: cmpq %rsi, %rax ; CHECK-NEXT: cmovbeq %rsi, %rax +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %bb4 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: addsd %xmm0, %xmm0 -; CHECK-NEXT: movsd %xmm0, (%rdi) -; CHECK-NEXT: addq $8, %rdi -; CHECK-NEXT: decq %rax +; CHECK-NEXT: movsd %xmm0, (%rdi,%rcx,8) +; CHECK-NEXT: incq %rcx +; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/or-lea.ll b/llvm/test/CodeGen/X86/or-lea.ll index 616ab99437892..cf1198aa3b172 100644 --- a/llvm/test/CodeGen/X86/or-lea.ll +++ b/llvm/test/CodeGen/X86/or-lea.ll @@ -220,7 +220,8 @@ define i32 @or_and_and_rhs_neg_i32(i32 %x, i32 %y, i32 %z) { ; BMI-NEXT: # kill: def $edx killed $edx def $rdx ; BMI-NEXT: andl %esi, %edx ; BMI-NEXT: andnl %edi, %esi, %eax -; BMI-NEXT: leal 1(%rdx,%rax), 
%eax +; BMI-NEXT: addl %edx, %eax +; BMI-NEXT: incl %eax ; BMI-NEXT: retq entry: %and1 = and i32 %z, %y @@ -256,7 +257,8 @@ define i32 @or_and_and_lhs_neg_i32(i32 %x, i32 %y, i32 %z) { ; BMI-NEXT: # kill: def $edx killed $edx def $rdx ; BMI-NEXT: andl %esi, %edx ; BMI-NEXT: andnl %edi, %esi, %eax -; BMI-NEXT: leal 1(%rdx,%rax), %eax +; BMI-NEXT: addl %edx, %eax +; BMI-NEXT: incl %eax ; BMI-NEXT: retq entry: %and1 = and i32 %z, %y @@ -292,7 +294,8 @@ define i32 @or_and_rhs_neg_and_i32(i32 %x, i32 %y, i32 %z) { ; BMI-NEXT: # kill: def $edi killed $edi def $rdi ; BMI-NEXT: andnl %edx, %esi, %eax ; BMI-NEXT: andl %esi, %edi -; BMI-NEXT: leal 1(%rax,%rdi), %eax +; BMI-NEXT: addl %edi, %eax +; BMI-NEXT: incl %eax ; BMI-NEXT: retq entry: %xor = xor i32 %y, -1 @@ -328,7 +331,8 @@ define i32 @or_and_lhs_neg_and_i32(i32 %x, i32 %y, i32 %z) { ; BMI-NEXT: # kill: def $edi killed $edi def $rdi ; BMI-NEXT: andnl %edx, %esi, %eax ; BMI-NEXT: andl %esi, %edi -; BMI-NEXT: leal 1(%rax,%rdi), %eax +; BMI-NEXT: addl %edi, %eax +; BMI-NEXT: incl %eax ; BMI-NEXT: retq entry: %xor = xor i32 %y, -1 @@ -342,11 +346,11 @@ entry: define i64 @or_and_and_rhs_neg_i64(i64 %x, i64 %y, i64 %z) { ; X86-LABEL: or_and_and_rhs_neg_i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %eax @@ -368,7 +372,8 @@ define i64 @or_and_and_rhs_neg_i64(i64 %x, i64 %y, i64 %z) { ; BMI: # %bb.0: # %entry ; BMI-NEXT: andq %rsi, %rdx ; BMI-NEXT: andnq %rdi, %rsi, %rax -; BMI-NEXT: leaq 1(%rdx,%rax), %rax +; BMI-NEXT: addq %rdx, %rax +; BMI-NEXT: incq %rax ; BMI-NEXT: retq entry: %and1 = and i64 %z, %y @@ -382,11 +387,11 @@ entry: define i64 @or_and_and_lhs_neg_i64(i64 %x, i64 %y, i64 %z) { ; X86-LABEL: or_and_and_lhs_neg_i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %eax @@ -408,7 +413,8 @@ define i64 @or_and_and_lhs_neg_i64(i64 %x, i64 %y, i64 %z) { ; BMI: # %bb.0: # %entry ; BMI-NEXT: andq %rsi, %rdx ; BMI-NEXT: andnq %rdi, %rsi, %rax -; BMI-NEXT: leaq 1(%rdx,%rax), %rax +; BMI-NEXT: addq %rdx, %rax +; BMI-NEXT: incq %rax ; BMI-NEXT: retq entry: %and1 = and i64 %z, %y @@ -422,11 +428,11 @@ entry: define i64 @or_and_rhs_neg_and_i64(i64 %x, i64 %y, i64 %z) { ; X86-LABEL: or_and_rhs_neg_and_i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %eax @@ -448,7 +454,8 @@ define i64 @or_and_rhs_neg_and_i64(i64 %x, i64 %y, i64 %z) { ; BMI: # %bb.0: # %entry ; BMI-NEXT: andnq %rdx, %rsi, %rax ; BMI-NEXT: andq %rsi, %rdi -; BMI-NEXT: leaq 1(%rax,%rdi), %rax +; BMI-NEXT: addq %rdi, %rax +; BMI-NEXT: incq %rax ; BMI-NEXT: retq entry: %xor = xor i64 %y, -1 @@ -462,11 +469,11 @@ entry: define i64 @or_and_lhs_neg_and_i64(i64 %x, i64 %y, i64 %z) { ; X86-LABEL: 
or_and_lhs_neg_and_i64: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %eax @@ -488,7 +495,8 @@ define i64 @or_and_lhs_neg_and_i64(i64 %x, i64 %y, i64 %z) { ; BMI: # %bb.0: # %entry ; BMI-NEXT: andnq %rdx, %rsi, %rax ; BMI-NEXT: andq %rsi, %rdi -; BMI-NEXT: leaq 1(%rax,%rdi), %rax +; BMI-NEXT: addq %rdi, %rax +; BMI-NEXT: incq %rax ; BMI-NEXT: retq entry: %xor = xor i64 %y, -1 @@ -505,7 +513,7 @@ define i32 @or_sext1(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: leal -1(,%eax,2), %eax ; X86-NEXT: retl ; ; X64-LABEL: or_sext1: @@ -513,7 +521,7 @@ define i32 @or_sext1(i32 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $43, %edi ; X64-NEXT: setl %al -; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: leal -1(,%rax,2), %eax ; X64-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sext = sext i1 %cmp to i32 @@ -540,7 +548,7 @@ define i64 @or_sext1_64(i64 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $43, %rdi ; X64-NEXT: setl %al -; X64-NEXT: leaq -1(%rax,%rax), %rax +; X64-NEXT: leaq -1(,%rax,2), %rax ; X64-NEXT: retq %cmp = icmp sgt i64 %x, 42 %sext = sext i1 %cmp to i64 @@ -554,7 +562,8 @@ define i32 @or_sext2(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al -; X86-NEXT: leal -1(%eax,%eax,2), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: decl %eax ; X86-NEXT: retl ; ; X64-LABEL: or_sext2: @@ -562,7 +571,8 @@ define i32 @or_sext2(i32 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $43, %edi ; X64-NEXT: setl %al -; X64-NEXT: leal -1(%rax,%rax,2), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: decl %eax ; X64-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sext = sext i1 %cmp to i32 @@ -589,7 +599,8 @@ define i64 @or_sext2_64(i64 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $43, %rdi ; X64-NEXT: setl %al -; X64-NEXT: leaq -1(%rax,%rax,2), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: decq %rax ; X64-NEXT: retq %cmp = icmp sgt i64 %x, 42 %sext = sext i1 %cmp to i64 @@ -652,7 +663,8 @@ define i32 @or_sext4(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al -; X86-NEXT: leal -1(%eax,%eax,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: decl %eax ; X86-NEXT: retl ; ; X64-LABEL: or_sext4: @@ -660,7 +672,8 @@ define i32 @or_sext4(i32 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $43, %edi ; X64-NEXT: setl %al -; X64-NEXT: leal -1(%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: decl %eax ; X64-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sext = sext i1 %cmp to i32 @@ -687,7 +700,8 @@ define i64 @or_sext4_64(i64 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $43, %rdi ; X64-NEXT: setl %al -; X64-NEXT: leaq -1(%rax,%rax,4), %rax +; X64-NEXT: leaq (%rax,%rax,4), %rax +; X64-NEXT: decq %rax ; X64-NEXT: retq %cmp = icmp sgt i64 %x, 42 %sext = sext i1 %cmp to i64 @@ -750,7 +764,8 @@ define i32 @or_sext8(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al -; X86-NEXT: leal -1(%eax,%eax,8), %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: decl %eax ; X86-NEXT: retl ; ; X64-LABEL: or_sext8: @@ -758,7 +773,8 @@ 
define i32 @or_sext8(i32 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $43, %edi ; X64-NEXT: setl %al -; X64-NEXT: leal -1(%rax,%rax,8), %eax +; X64-NEXT: leal (%rax,%rax,8), %eax +; X64-NEXT: decl %eax ; X64-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sext = sext i1 %cmp to i32 @@ -785,7 +801,8 @@ define i64 @or_sext8_64(i64 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $43, %rdi ; X64-NEXT: setl %al -; X64-NEXT: leaq -1(%rax,%rax,8), %rax +; X64-NEXT: leaq (%rax,%rax,8), %rax +; X64-NEXT: decq %rax ; X64-NEXT: retq %cmp = icmp sgt i64 %x, 42 %sext = sext i1 %cmp to i64 diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll index 5900e7674cd0e..535e4425126c1 100644 --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -10,13 +10,12 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx diff --git a/llvm/test/CodeGen/X86/overflowing-iv-codegen.ll b/llvm/test/CodeGen/X86/overflowing-iv-codegen.ll index e5dd5b23bea96..0d8e593a7f8aa 100644 --- a/llvm/test/CodeGen/X86/overflowing-iv-codegen.ll +++ b/llvm/test/CodeGen/X86/overflowing-iv-codegen.ll @@ -5,16 +5,16 @@ define i32 @test_01(ptr %p, i64 %len, i32 %x) { ; CHECK-LABEL: test_01: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $-4, %rdi +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $1, %rsi -; CHECK-NEXT: jb .LBB0_4 +; CHECK-NEXT: cmpq %rax, %rsi +; CHECK-NEXT: je .LBB0_4 ; CHECK-NEXT: # %bb.2: # %backedge ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: cmpl %edx, (%rdi) -; CHECK-NEXT: leaq 4(%rdi), %rdi +; CHECK-NEXT: cmpl %edx, -4(%rdi,%rax,4) +; CHECK-NEXT: leaq 1(%rax), %rax ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.3: # %failure ; CHECK-NEXT: .LBB0_4: # %exit @@ -47,16 +47,16 @@ failure: ; preds = %backedge define i32 @test_02(ptr %p, i64 %len, i32 %x) { ; CHECK-LABEL: test_02: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $-4, %rdi +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $1, %rsi -; CHECK-NEXT: jb .LBB1_4 +; CHECK-NEXT: cmpq %rax, %rsi +; CHECK-NEXT: je .LBB1_4 ; CHECK-NEXT: # %bb.2: # %backedge ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: cmpl %edx, (%rdi) -; CHECK-NEXT: leaq 4(%rdi), %rdi +; CHECK-NEXT: cmpl %edx, -4(%rdi,%rax,4) +; CHECK-NEXT: leaq 1(%rax), %rax ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.3: # %failure ; CHECK-NEXT: .LBB1_4: # %exit @@ -89,16 +89,16 @@ failure: ; preds = %backedge define i32 @test_02_nopoison(ptr %p, i64 %len, i32 %x) { ; CHECK-LABEL: test_02_nopoison: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $-4, %rdi +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $1, %rsi -; CHECK-NEXT: jb .LBB2_4 +; CHECK-NEXT: cmpq %rax, %rsi +; CHECK-NEXT: je .LBB2_4 ; CHECK-NEXT: # %bb.2: # %backedge ; CHECK-NEXT: # 
in Loop: Header=BB2_1 Depth=1 -; CHECK-NEXT: cmpl %edx, (%rdi) -; CHECK-NEXT: leaq 4(%rdi), %rdi +; CHECK-NEXT: cmpl %edx, -4(%rdi,%rax,4) +; CHECK-NEXT: leaq 1(%rax), %rax ; CHECK-NEXT: jne .LBB2_1 ; CHECK-NEXT: # %bb.3: # %failure ; CHECK-NEXT: .LBB2_4: # %exit @@ -133,16 +133,16 @@ failure: ; preds = %backedge define i32 @test_03(ptr %p, i64 %len, i32 %x) { ; CHECK-LABEL: test_03: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $-4, %rdi +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $1, %rsi -; CHECK-NEXT: jb .LBB3_4 +; CHECK-NEXT: cmpq %rax, %rsi +; CHECK-NEXT: je .LBB3_4 ; CHECK-NEXT: # %bb.2: # %backedge ; CHECK-NEXT: # in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: cmpl %edx, (%rdi) -; CHECK-NEXT: leaq 4(%rdi), %rdi +; CHECK-NEXT: cmpl %edx, -4(%rdi,%rax,4) +; CHECK-NEXT: leaq 1(%rax), %rax ; CHECK-NEXT: jne .LBB3_1 ; CHECK-NEXT: # %bb.3: # %failure ; CHECK-NEXT: .LBB3_4: # %exit @@ -175,16 +175,16 @@ failure: ; preds = %backedge define i32 @test_03_nopoison(ptr %p, i64 %len, i32 %x) { ; CHECK-LABEL: test_03_nopoison: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addq $-4, %rdi +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $1, %rsi -; CHECK-NEXT: jb .LBB4_4 +; CHECK-NEXT: cmpq %rax, %rsi +; CHECK-NEXT: je .LBB4_4 ; CHECK-NEXT: # %bb.2: # %backedge ; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1 -; CHECK-NEXT: cmpl %edx, (%rdi) -; CHECK-NEXT: leaq 4(%rdi), %rdi +; CHECK-NEXT: cmpl %edx, -4(%rdi,%rax,4) +; CHECK-NEXT: leaq 1(%rax), %rax ; CHECK-NEXT: jne .LBB4_1 ; CHECK-NEXT: # %bb.3: # %failure ; CHECK-NEXT: .LBB4_4: # %exit diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll index 384e40496d82a..1b0af955ae824 100644 --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -131,7 +131,7 @@ define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) { ; SSE4-LABEL: trunc_lshr_v4i64_demandedelts: ; SSE4: # %bb.0: ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE4-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,1,1,1] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] ; SSE4-NEXT: pand %xmm2, %xmm1 ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE4-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll index 26c55d6371b51..826bf67fb5a4d 100644 --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -613,7 +613,7 @@ define <64 x i8> @test17(<64 x i8> %x) { ; ; AVX1-LABEL: test17: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -993,26 +993,12 @@ define <16 x i16> @test27(<16 x i16> %x) { } define <16 x i16> @test28(<16 x i16> %x) { -; SSE2-LABEL: test28: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE2-NEXT: paddusw %xmm2, %xmm0 -; SSE2-NEXT: paddusw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test28: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = 
[65534,65534,65534,65534,65534,65534,65534,65534] -; SSSE3-NEXT: paddusw %xmm2, %xmm0 -; SSSE3-NEXT: paddusw %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test28: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE41-NEXT: paddusw %xmm2, %xmm0 -; SSE41-NEXT: paddusw %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: test28: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE-NEXT: paddusw %xmm2, %xmm0 +; SSE-NEXT: paddusw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: test28: ; AVX1: # %bb.0: @@ -1129,26 +1115,12 @@ define <16 x i16> @test29(<16 x i16> %x) { } define <16 x i16> @test30(<16 x i16> %x) { -; SSE2-LABEL: test30: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] -; SSE2-NEXT: paddusw %xmm2, %xmm0 -; SSE2-NEXT: paddusw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test30: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] -; SSSE3-NEXT: paddusw %xmm2, %xmm0 -; SSSE3-NEXT: paddusw %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test30: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] -; SSE41-NEXT: paddusw %xmm2, %xmm0 -; SSE41-NEXT: paddusw %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: test30: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; SSE-NEXT: paddusw %xmm2, %xmm0 +; SSE-NEXT: paddusw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: test30: ; AVX1: # %bb.0: @@ -1322,32 +1294,14 @@ define <32 x i16> @test33(<32 x i16> %x) { } define <32 x i16> @test34(<32 x i16> %x) { -; SSE2-LABEL: test34: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE2-NEXT: paddusw %xmm4, %xmm0 -; SSE2-NEXT: paddusw %xmm4, %xmm1 -; SSE2-NEXT: paddusw %xmm4, %xmm2 -; SSE2-NEXT: paddusw %xmm4, %xmm3 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test34: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSSE3-NEXT: paddusw %xmm4, %xmm0 -; SSSE3-NEXT: paddusw %xmm4, %xmm1 -; SSSE3-NEXT: paddusw %xmm4, %xmm2 -; SSSE3-NEXT: paddusw %xmm4, %xmm3 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test34: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] -; SSE41-NEXT: paddusw %xmm4, %xmm0 -; SSE41-NEXT: paddusw %xmm4, %xmm1 -; SSE41-NEXT: paddusw %xmm4, %xmm2 -; SSE41-NEXT: paddusw %xmm4, %xmm3 -; SSE41-NEXT: retq +; SSE-LABEL: test34: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534] +; SSE-NEXT: paddusw %xmm4, %xmm0 +; SSE-NEXT: paddusw %xmm4, %xmm1 +; SSE-NEXT: paddusw %xmm4, %xmm2 +; SSE-NEXT: paddusw %xmm4, %xmm3 +; SSE-NEXT: retq ; ; AVX1-LABEL: test34: ; AVX1: # %bb.0: @@ -1467,7 +1421,7 @@ define <32 x i16> @test35(<32 x i16> %x) { ; ; AVX1-LABEL: test35: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -1524,32 +1478,14 @@ define <32 x i16> @test35(<32 x i16> %x) { } define <32 x i16> @test36(<32 x i16> %x) { -; SSE2-LABEL: test36: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[2,2,2,2,2,2,2,2] -; SSE2-NEXT: paddusw %xmm4, %xmm0 -; SSE2-NEXT: paddusw %xmm4, %xmm1 -; SSE2-NEXT: paddusw %xmm4, %xmm2 -; SSE2-NEXT: paddusw %xmm4, %xmm3 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test36: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] -; SSSE3-NEXT: paddusw %xmm4, %xmm0 -; SSSE3-NEXT: paddusw %xmm4, %xmm1 -; SSSE3-NEXT: paddusw %xmm4, %xmm2 -; SSSE3-NEXT: paddusw %xmm4, %xmm3 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test36: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] -; SSE41-NEXT: paddusw %xmm4, %xmm0 -; SSE41-NEXT: paddusw %xmm4, %xmm1 -; SSE41-NEXT: paddusw %xmm4, %xmm2 -; SSE41-NEXT: paddusw %xmm4, %xmm3 -; SSE41-NEXT: retq +; SSE-LABEL: test36: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2] +; SSE-NEXT: paddusw %xmm4, %xmm0 +; SSE-NEXT: paddusw %xmm4, %xmm1 +; SSE-NEXT: paddusw %xmm4, %xmm2 +; SSE-NEXT: paddusw %xmm4, %xmm3 +; SSE-NEXT: retq ; ; AVX1-LABEL: test36: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/peep-test-0.ll b/llvm/test/CodeGen/X86/peep-test-0.ll index 3a0ef86b64ee3..95ca951e76c01 100644 --- a/llvm/test/CodeGen/X86/peep-test-0.ll +++ b/llvm/test/CodeGen/X86/peep-test-0.ll @@ -5,17 +5,19 @@ define void @loop(i64 %n, ptr nocapture %d) nounwind { ; CHECK-LABEL: loop: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: shlq $4, %rdi +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %bb ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm1 -; CHECK-NEXT: movsd %xmm1, (%rax) -; CHECK-NEXT: addq $8, %rax -; CHECK-NEXT: incq %rdi +; CHECK-NEXT: movsd %xmm1, (%rdi,%rcx,8) +; CHECK-NEXT: incq %rcx +; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index f3741dc202dc5..3b01607e5f613 100644 --- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -284,12 +284,12 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK32-NEXT: lock cmpxchg8b (%esi) +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK32-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: movl %ebp, %edx ; CHECK32-NEXT: movl %edi, %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK32-NEXT: lock cmpxchg8b (%esi) ; CHECK32-NEXT: sete %al ; CHECK32-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll index b38a10c7e4263..fec1d42e78722 100644 --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -1590,9 +1590,9 @@ define i32 @extract_extract01_v4i32_add_i32_uses1(<4 x i32> %x, ptr %p) { ; ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1: ; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1 ; AVX-FAST-NEXT: vmovd %xmm0, (%rdi) -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; 
AVX-FAST-NEXT: vmovd %xmm0, %eax +; AVX-FAST-NEXT: vmovd %xmm1, %eax ; AVX-FAST-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 0 store i32 %x0, ptr %p @@ -1614,17 +1614,17 @@ define i32 @extract_extract01_v4i32_add_i32_uses2(<4 x i32> %x, ptr %p) { ; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: ; SSE3-FAST: # %bb.0: ; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE3-FAST-NEXT: movd %xmm1, (%rdi) ; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSE3-FAST-NEXT: movd %xmm1, (%rdi) ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vmovd %xmm0, %ecx +; AVX-SLOW-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax ; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: @@ -1655,9 +1655,9 @@ define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, ptr %p1, ptr %p2 ; AVX: # %bb.0: ; AVX-NEXT: vmovd %xmm0, %ecx ; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: vpextrd $1, %xmm0, (%rsi) ; AVX-NEXT: vpextrd $1, %xmm0, %eax ; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpextrd $1, %xmm0, (%rsi) ; AVX-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 0 store i32 %x0, ptr %p1 diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/CodeGen/X86/phaddsub-undef.ll index 8aa40939994fd..06a98f7ddd05d 100644 --- a/llvm/test/CodeGen/X86/phaddsub-undef.ll +++ b/llvm/test/CodeGen/X86/phaddsub-undef.ll @@ -50,21 +50,21 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { ; ; SSE-FAST-LABEL: test15_undef: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movdqa %xmm3, %xmm1 ; SSE-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSE-FAST-NEXT: movdqa %xmm3, %xmm1 ; SSE-FAST-NEXT: phaddd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: test15_undef: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vpextrd $1, %xmm0, %ecx -; AVX1-SLOW-NEXT: addl %eax, %ecx +; AVX1-SLOW-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-SLOW-NEXT: vmovd %xmm0, %ecx +; AVX1-SLOW-NEXT: addl %ecx, %eax ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vmovd %xmm0, %ecx ; AVX1-SLOW-NEXT: vpextrd $1, %xmm0, %edx -; AVX1-SLOW-NEXT: addl %eax, %edx -; AVX1-SLOW-NEXT: vmovd %ecx, %xmm0 +; AVX1-SLOW-NEXT: addl %ecx, %edx +; AVX1-SLOW-NEXT: vmovd %eax, %xmm0 ; AVX1-SLOW-NEXT: vmovd %edx, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll index d6c9877cd99b6..d07c124c97492 100644 --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -87,7 +87,6 @@ define <16 x i16> @pmaddubsw_256(ptr %Aptr, ptr %Bptr) { define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_512: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 @@ -100,6 +99,7 @@ define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) { ; SSE-NEXT: pmaddubsw 64(%rsi), %xmm4 ; SSE-NEXT: movdqa 80(%rdx), %xmm5 ; SSE-NEXT: pmaddubsw 80(%rsi), %xmm5 +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: movdqa 96(%rdx), %xmm6 ; SSE-NEXT: pmaddubsw 96(%rsi), %xmm6 ; SSE-NEXT: movdqa 112(%rdx), %xmm7 @@ -122,15 +122,15 @@ define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) { ; AVX1-NEXT: vmovdqa 48(%rsi), 
%xmm3 ; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm3 ; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa 80(%rsi), %xmm2 -; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: vmovdqa 112(%rsi), %xmm3 ; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa 96(%rsi), %xmm4 @@ -158,10 +158,10 @@ define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) { ; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm1 +; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: pmaddubsw_512: @@ -390,9 +390,9 @@ define <8 x i16> @pmaddubsw_bad_indices(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_bad_indices: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] ; SSE-NEXT: psraw $8, %xmm1 @@ -411,21 +411,21 @@ define <8 x i16> @pmaddubsw_bad_indices(ptr %Aptr, ptr %Bptr) { ; AVX1-LABEL: pmaddubsw_bad_indices: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 ; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmovsx-inreg.ll b/llvm/test/CodeGen/X86/pmovsx-inreg.ll index a39ea60331a5e..8c43edd01291c 100644 --- a/llvm/test/CodeGen/X86/pmovsx-inreg.ll +++ b/llvm/test/CodeGen/X86/pmovsx-inreg.ll @@ -27,8 +27,8 @@ define void @test1(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test1: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbq (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxbq (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -74,8 +74,8 @@ define void @test2(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test2: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbq (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxbq (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, (%eax) @@ -108,8 +108,8 @@ define void @test3(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test3: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbd (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxbd (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -155,8 +155,8 @@ define void @test4(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test4: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbd (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxbd (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, (%eax) @@ -189,8 +189,8 @@ define void @test5(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test5: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbw (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxbw (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -236,8 +236,8 @@ define void @test6(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test6: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxbw (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxbw (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, 
(%eax) @@ -270,8 +270,8 @@ define void @test7(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test7: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxwq (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxwq (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -317,8 +317,8 @@ define void @test8(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test8: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxwq (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwq (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, (%eax) @@ -351,8 +351,8 @@ define void @test9(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test9: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxwd (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -398,8 +398,8 @@ define void @test10(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test10: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxwd (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, (%eax) @@ -432,8 +432,8 @@ define void @test11(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test11: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxdq (%ecx), %xmm0 +; X86-AVX2-NEXT: vpmovsxdq (%eax), %xmm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %xmm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax) @@ -479,8 +479,8 @@ define void @test12(ptr %in, ptr %out) nounwind { ; X86-AVX2-LABEL: test12: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpmovsxdq (%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxdq (%eax), %ymm0 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups %ymm1, (%eax) ; X86-AVX2-NEXT: vmovdqu %ymm0, (%eax) diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index c7cc2acaf2627..c743847eceb67 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -105,31 +105,20 @@ entry: } define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind { -; SSE2-LABEL: mul_v2i64c: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: mul_v2i64c: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [117,117] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmuludq %xmm1, %xmm2 -; SSE41-NEXT: psrlq 
$32, %xmm0 -; SSE41-NEXT: pmuludq %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: mul_v2i64c: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: mul_v2i64c: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [117,117] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -160,7 +149,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -400,10 +389,10 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v32i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pmaddubsw %xmm2, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 @@ -447,19 +436,12 @@ entry: } define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind { -; SSE2-LABEL: mul_v16i16c: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: mul_v16i16c: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: mul_v16i16c: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] +; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX-LABEL: mul_v16i16c: ; AVX: # %bb.0: # %entry @@ -490,7 +472,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; SSE41-LABEL: mul_v8i32c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [117,117,117,117] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117] ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: retq @@ -506,39 +488,22 @@ entry: } define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { -; SSE2-LABEL: mul_v4i64c: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: psllq $32, %xmm1 -; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: mul_v4i64c: -; SSE41: # %bb.0: # %entry -; 
SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [117,117] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: pmuludq %xmm2, %xmm1 -; SSE41-NEXT: psllq $32, %xmm1 -; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: mul_v4i64c: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: retq ; ; AVX-LABEL: mul_v4i64c: ; AVX: # %bb.0: # %entry @@ -584,7 +549,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v32i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pandn %xmm2, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm2 @@ -773,10 +738,10 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v64i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] ; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: pmaddubsw %xmm4, %xmm6 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; SSE41-NEXT: pmaddubsw %xmm7, %xmm0 @@ -899,7 +864,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v64i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm8, %xmm9 ; SSE41-NEXT: pandn %xmm4, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm4 @@ -959,7 +924,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 @@ -979,7 +944,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512BW-LABEL: mul_v64i8: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 300da68d9a3b3..7cfbb03b2ffe6 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -329,7 +329,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -566,7 +566,6 @@ define <32 x i16> @sext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) { define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { ; SSE-LABEL: zext_mulhuw_v64i16: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 @@ -574,6 +573,7 @@ define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm7, 112(%rdi) ; SSE-NEXT: movdqa %xmm6, 96(%rdi) @@ -623,7 +623,6 @@ define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { define <64 x i16> @sext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { ; SSE-LABEL: sext_mulhuw_v64i16: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 @@ -631,6 +630,7 @@ define <64 x i16> @sext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) { ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa %xmm7, 112(%rdi) ; SSE-NEXT: movdqa %xmm6, 96(%rdi) @@ -965,9 +965,9 @@ define void @PR109790(ptr sret([32 x i8]) %ret, ptr %a) { ; ; AVX512F-LABEL: PR109790: ; AVX512F: # %bb.0: -; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0 @@ -977,9 +977,9 @@ define void @PR109790(ptr sret([32 x i8]) %ret, ptr %a) { ; ; AVX512BW-LABEL: PR109790: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512BW-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0,64536,0] ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 @@ -1460,8 +1460,8 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] @@ -1499,8 +1499,8 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-LABEL: zext_mulhuw_v64i16_lshr: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pxor %xmm11, %xmm11 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] @@ -1641,8 +1641,8 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] @@ -1680,8 +1680,8 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-LABEL: mulhsw_v64i16_lshr: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pxor %xmm11, %xmm11 ; SSE41-NEXT: movdqa %xmm8, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] @@ -1803,8 +1803,8 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-LABEL: mulhsw_v64i16_ashr: ; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] @@ -1813,8 +1813,8 @@ define <64 x i32> 
@mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] ; SSE2-NEXT: psrad $16, %xmm10 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] @@ -1823,8 +1823,8 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; SSE2-NEXT: psrad $16, %xmm11 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: psrad $16, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; SSE2-NEXT: psrad $16, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] @@ -1833,8 +1833,8 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] @@ -1864,8 +1864,8 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; ; SSE41-LABEL: mulhsw_v64i16_ashr: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pmovsxwd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 @@ -1880,8 +1880,8 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm11 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE41-NEXT: pmovsxwd %xmm3, %xmm3 ; SSE41-NEXT: pmovsxwd %xmm4, %xmm12 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 35c7c0e09f394..04f189565a486 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -463,14 +463,14 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: addl %eax, %ecx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi -; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: addl %edx, %esi +; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: movl %esi, (%eax) ; X86-POPCNT-NEXT: movl $0, 12(%eax) ; 
X86-POPCNT-NEXT: movl $0, 8(%eax) @@ -1058,8 +1058,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx @@ -1539,8 +1539,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx diff --git a/llvm/test/CodeGen/X86/pr120093.ll b/llvm/test/CodeGen/X86/pr120093.ll index 99bf2218c5538..4e405397559b6 100644 --- a/llvm/test/CodeGen/X86/pr120093.ll +++ b/llvm/test/CodeGen/X86/pr120093.ll @@ -5,10 +5,10 @@ define double @PR120093() { ; CHECK-LABEL: PR120093: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorpd %xmm0, %xmm0 +; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: cmpnltpd %xmm1, %xmm0 -; CHECK-NEXT: movmskpd %xmm0, %eax +; CHECK-NEXT: cmpnltpd %xmm0, %xmm1 +; CHECK-NEXT: movmskpd %xmm1, %eax ; CHECK-NEXT: cmpl $3, %eax ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb2 diff --git a/llvm/test/CodeGen/X86/pr18344.ll b/llvm/test/CodeGen/X86/pr18344.ll index 75a55e6a4bf5e..df71a47bff54c 100644 --- a/llvm/test/CodeGen/X86/pr18344.ll +++ b/llvm/test/CodeGen/X86/pr18344.ll @@ -37,8 +37,8 @@ define void @FFT(ptr noalias nocapture %destination, ptr noalias %re, ptr noalia ; X64-NEXT: movdqu (%rdx), %xmm0 ; X64-NEXT: pslld $4, %xmm0 ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cltq ; X64-NEXT: pextrd $1, %xmm0, %ecx +; X64-NEXT: cltq ; X64-NEXT: movslq %ecx, %rcx ; X64-NEXT: pextrd $2, %xmm0, %edx ; X64-NEXT: movslq %edx, %rdx diff --git a/llvm/test/CodeGen/X86/pr2656.ll b/llvm/test/CodeGen/X86/pr2656.ll index 8306dafbb346f..facb6a60cc4f3 100644 --- a/llvm/test/CodeGen/X86/pr2656.ll +++ b/llvm/test/CodeGen/X86/pr2656.ll @@ -19,9 +19,9 @@ define void @foo(ptr byval(%struct.anon) %p) nounwind { ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: xorps %xmm0, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm1, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: xorps %xmm0, %xmm2 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtss2sd %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll index 2e5c6f047292c..b099e0c399404 100644 --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -11,29 +11,29 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1 ; CHECK-NEXT: subq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovaps %xmm1, %xmm13 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17] +; CHECK-NEXT: vmovaps {{.*#+}} xmm5 = [3,20,1,17] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm5 ; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,1,2,3] -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm6 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,20,1,27] +; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,20,1,27] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23] +; CHECK-NEXT: vmovaps {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23] +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,28,1,17] +; CHECK-NEXT: vmovaps {{.*#+}} xmm12 = [4,28,1,17] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm12 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,20,1,17,5,20,5,21] +; CHECK-NEXT: vmovaps {{.*#+}} ymm8 = [5,20,1,17,5,20,5,21] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,30,1,22] +; CHECK-NEXT: vmovaps {{.*#+}} xmm9 = [4,30,1,22] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm9 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm10 = [4,22,1,17,4,22,5,21] +; CHECK-NEXT: vmovaps {{.*#+}} ymm10 = [4,22,1,17,4,22,5,21] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,20,3,18,4,20,7,22] +; CHECK-NEXT: vmovaps {{.*#+}} ymm11 = [4,20,3,18,4,20,7,22] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm11 ; CHECK-NEXT: vaddps %xmm10, %xmm11, %xmm2 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/pr30562.ll b/llvm/test/CodeGen/X86/pr30562.ll index e05a8672b1f81..e656d78ccf710 100644 --- a/llvm/test/CodeGen/X86/pr30562.ll +++ b/llvm/test/CodeGen/X86/pr30562.ll @@ -16,8 +16,8 @@ define i32 @foo(ptr nocapture %perm, i32 %n) { ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -24(%rsp,%rcx,8), %rdx ; CHECK-NEXT: movups %xmm0, (%rdi,%rdx,8) -; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %exit ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr31271.ll b/llvm/test/CodeGen/X86/pr31271.ll index 0ac1943a428ef..14f725dbc6d21 100644 --- a/llvm/test/CodeGen/X86/pr31271.ll +++ b/llvm/test/CodeGen/X86/pr31271.ll @@ -12,8 +12,8 @@ define void @fn1(i32 %k, ptr %p) { ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: setne %dl -; CHECK-NEXT: addl $c, %eax ; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: addl $c, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %r ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll index d9671aa04f460..fec8628829b6c 100644 --- a/llvm/test/CodeGen/X86/pr32329.ll +++ b/llvm/test/CodeGen/X86/pr32329.ll @@ -51,9 +51,9 @@ define void @foo() local_unnamed_addr { ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovnel %ecx, %ebx ; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: setge var_205 ; X86-NEXT: movl %ebp, var_50+4 ; X86-NEXT: movl %ebx, var_50 -; X86-NEXT: setge var_205 ; X86-NEXT: imull %eax, %edx ; X86-NEXT: movb %dl, var_218 ; X86-NEXT: popl %esi @@ -69,24 +69,24 @@ define void @foo() local_unnamed_addr { ; X64-LABEL: foo: ; X64: # %bb.0: # %entry ; X64-NEXT: movsbl var_27(%rip), 
%eax -; X64-NEXT: movzwl var_2(%rip), %edx ; X64-NEXT: movl var_310(%rip), %ecx ; X64-NEXT: imull %eax, %ecx ; X64-NEXT: addl var_24(%rip), %ecx -; X64-NEXT: movl $4194303, %esi # imm = 0x3FFFFF -; X64-NEXT: andl obj(%rip), %esi -; X64-NEXT: leal (%rsi,%rsi), %edi +; X64-NEXT: movl $4194303, %edx # imm = 0x3FFFFF +; X64-NEXT: andl obj(%rip), %edx +; X64-NEXT: movzwl var_2(%rip), %esi +; X64-NEXT: leal (%rdx,%rdx), %edi ; X64-NEXT: subl %eax, %edi ; X64-NEXT: movl %edi, %r8d -; X64-NEXT: subl %edx, %r8d +; X64-NEXT: subl %esi, %r8d ; X64-NEXT: imull %r8d, %ecx ; X64-NEXT: addb $113, %cl -; X64-NEXT: movl $9, %edx +; X64-NEXT: movl $9, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shlq %cl, %rdx -; X64-NEXT: movq %rdx, var_50(%rip) -; X64-NEXT: cmpl %esi, %r8d +; X64-NEXT: shlq %cl, %rsi +; X64-NEXT: cmpl %edx, %r8d ; X64-NEXT: setge var_205(%rip) +; X64-NEXT: movq %rsi, var_50(%rip) ; X64-NEXT: imull %eax, %edi ; X64-NEXT: movb %dil, var_218(%rip) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll index c7405e982660c..3a9873f6a4cd1 100644 --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -92,10 +92,10 @@ define void @foo() { ; X86-NEXT: movzbl var_27, %ecx ; X86-NEXT: movzwl var_22, %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: addb $30, %cl ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: testb $32, %cl ; X86-NEXT: jne .LBB0_2 ; X86-NEXT: # %bb.1: # %bb diff --git a/llvm/test/CodeGen/X86/pr32368.ll b/llvm/test/CodeGen/X86/pr32368.ll index 52cf6fb07d672..c10bacea688aa 100644 --- a/llvm/test/CodeGen/X86/pr32368.ll +++ b/llvm/test/CodeGen/X86/pr32368.ll @@ -114,12 +114,12 @@ define <16 x float> @PR32368_512(<16 x float>) { ; ; AVX1-LABEL: PR32368_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll index de34bfb13159c..4a8eacf6e46b7 100644 --- a/llvm/test/CodeGen/X86/pr34080-2.ll +++ b/llvm/test/CodeGen/X86/pr34080-2.ll @@ -14,38 +14,39 @@ define void @computeJD(ptr) nounwind { ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $40, %esp ; CHECK-NEXT: movl 8(%ebp), %ebx -; CHECK-NEXT: movl 8(%ebx), %esi ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $3, 12(%ebx) +; CHECK-NEXT: movl 8(%ebx), %edi ; CHECK-NEXT: setl %al -; CHECK-NEXT: subl %eax, %esi +; CHECK-NEXT: subl %eax, %edi ; CHECK-NEXT: movl $-1374389535, %ecx # imm = 0xAE147AE1 -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: imull %ecx ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $5, %ecx -; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %edi, %eax 
; CHECK-NEXT: imull %edx -; CHECK-NEXT: movl %edx, %edi -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $7, %edi -; CHECK-NEXT: addl %eax, %edi -; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD +; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: imull $36525, %edi, %eax # imm = 0x8EAD ; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC ; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F ; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $5, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $7, %esi +; CHECK-NEXT: addl %eax, %esi ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $5, %edx ; CHECK-NEXT: addl %eax, %edx ; CHECK-NEXT: addl 16(%ebx), %ecx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: leal 257(%ecx,%edx), %eax +; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: leal (%ecx,%edx), %eax +; CHECK-NEXT: addl $257, %eax # imm = 0x101 ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: fildl {{[0-9]+}}(%esp) ; CHECK-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}} @@ -60,18 +61,18 @@ define void @computeJD(ptr) nounwind { ; CHECK-NEXT: movb $1, 36(%ebx) ; CHECK-NEXT: imull $3600000, 20(%ebx), %ecx # imm = 0x36EE80 ; CHECK-NEXT: imull $60000, 24(%ebx), %eax # imm = 0xEA60 -; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: fldl 28(%ebx) ; CHECK-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: sarl $31, %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: orl $3072, %edx # imm = 0xC00 +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: sarl $31, %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll index 5b2431eb21495..68cc5bfbb0afb 100644 --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ -8,23 +8,23 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512F-LABEL: test: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512F-NEXT: cmpq $3, %rsi +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: cmpq $3, %rcx ; AVX512F-NEXT: fld1 ; AVX512F-NEXT: fldz ; AVX512F-NEXT: fld %st(0) ; AVX512F-NEXT: fcmove %st(2), %st +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vmovq %xmm1, %rdx ; AVX512F-NEXT: cmpq $2, %rdx ; AVX512F-NEXT: fld %st(1) ; AVX512F-NEXT: fcmove %st(3), %st -; AVX512F-NEXT: cmpq $1, %rcx +; AVX512F-NEXT: cmpq $1, %rax ; AVX512F-NEXT: fld %st(2) ; AVX512F-NEXT: fcmove %st(4), %st -; AVX512F-NEXT: testq %rax, %rax +; AVX512F-NEXT: testq %rcx, %rcx ; AVX512F-NEXT: fxch %st(3) ; AVX512F-NEXT: fcmove %st(4), %st ; AVX512F-NEXT: fstp %st(4) @@ -49,7 +49,6 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> 
%b, ptr %c) local_unnamed_addr { ; AVX512VL-LABEL: test: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $8, %al ; AVX512VL-NEXT: fld1 @@ -59,10 +58,11 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512VL-NEXT: testb $2, %al ; AVX512VL-NEXT: fld %st(1) ; AVX512VL-NEXT: fcmovne %st(3), %st +; AVX512VL-NEXT: kshiftrb $2, %k0, %k0 ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(2) ; AVX512VL-NEXT: fcmovne %st(4), %st -; AVX512VL-NEXT: kmovd %k1, %eax +; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fxch %st(3) ; AVX512VL-NEXT: fcmovne %st(4), %st diff --git a/llvm/test/CodeGen/X86/pr34605.ll b/llvm/test/CodeGen/X86/pr34605.ll index 25dd6a7436a8a..c0c0c7e283856 100644 --- a/llvm/test/CodeGen/X86/pr34605.ll +++ b/llvm/test/CodeGen/X86/pr34605.ll @@ -4,9 +4,9 @@ define void @pr34605(ptr nocapture %s, i32 %p) { ; CHECK-LABEL: pr34605: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0 ; CHECK-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %k0 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %k1 ; CHECK-NEXT: kunpckwd %k0, %k1, %k0 ; CHECK-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %k1 diff --git a/llvm/test/CodeGen/X86/pr35972.ll b/llvm/test/CodeGen/X86/pr35972.ll index 981c47800c0f3..e66bb907acb7e 100644 --- a/llvm/test/CodeGen/X86/pr35972.ll +++ b/llvm/test/CodeGen/X86/pr35972.ll @@ -4,13 +4,13 @@ define void @test3(i32 %c, ptr %ptr) { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: sbbl %ecx, %ecx -; CHECK-NEXT: kmovd %ecx, %k0 +; CHECK-NEXT: sbbl %eax, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: kmovd %eax, %k0 ; CHECK-NEXT: kunpckdq %k0, %k0, %k0 -; CHECK-NEXT: kmovq %k0, (%eax) +; CHECK-NEXT: kmovq %k0, (%ecx) ; CHECK-NEXT: retl %cmp = icmp eq i32 %c, 0 %insert = insertelement <64 x i1> undef, i1 %cmp, i32 0 diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll index 0ad35309b87bb..43afd97fe9286 100644 --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -7,10 +7,10 @@ define float @PR35982_emms(<1 x i64>) nounwind { ; NO-POSTRA: # %bb.0: ; NO-POSTRA-NEXT: subl $8, %esp ; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax +; NO-POSTRA-NEXT: emms ; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; NO-POSTRA-NEXT: movd %mm0, %ecx -; NO-POSTRA-NEXT: emms ; NO-POSTRA-NEXT: movl %eax, (%esp) ; NO-POSTRA-NEXT: fildl (%esp) ; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -21,12 +21,12 @@ define float @PR35982_emms(<1 x i64>) nounwind { ; POSTRA-LABEL: PR35982_emms: ; POSTRA: # %bb.0: ; POSTRA-NEXT: subl $8, %esp +; POSTRA-NEXT: emms ; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax ; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] -; POSTRA-NEXT: movd %mm0, %ecx -; POSTRA-NEXT: emms ; POSTRA-NEXT: movl %eax, (%esp) +; POSTRA-NEXT: movd %mm0, %ecx ; POSTRA-NEXT: fildl (%esp) ; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/pr37499.ll b/llvm/test/CodeGen/X86/pr37499.ll index 
15a7739fd2c7f..2995017275c18 100644 --- a/llvm/test/CodeGen/X86/pr37499.ll +++ b/llvm/test/CodeGen/X86/pr37499.ll @@ -4,7 +4,7 @@ define <2 x i64> @undef_tval() { ; CHECK-LABEL: undef_tval: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0 {%k1} @@ -18,7 +18,7 @@ define <2 x i64> @undef_tval() { define <2 x i64> @foo(<8 x i64> %x) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1} @@ -33,7 +33,7 @@ define <2 x i64> @foo(<8 x i64> %x) { define <4 x i64> @goo(<16 x i32> %x) { ; CHECK-LABEL: goo: ; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1} diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll index b633c28a214b7..ae6639553519e 100644 --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -22,7 +22,7 @@ define void @f() nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $160, %esp +; X86-NEXT: subl $176, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -75,20 +75,20 @@ define void @f() nounwind { ; X86-NEXT: addl $64, %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: .LBB0_8: # %BB_udiv-special-cases -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl $-66, %eax ; X86-NEXT: movl $0, %ebx ; X86-NEXT: adcl $-1, %ebx -; X86-NEXT: movl $0, %edx -; X86-NEXT: adcl $3, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: adcl $3, %esi ; X86-NEXT: movb $1, %cl ; X86-NEXT: testb %cl, %cl ; X86-NEXT: jne .LBB0_14 ; X86-NEXT: # %bb.9: # %BB_udiv-special-cases -; X86-NEXT: andl $3, %edx +; X86-NEXT: andl $3, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl $65, %ecx -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: je .LBB0_14 ; X86-NEXT: # %bb.10: # %udiv-bb1 @@ -96,16 +96,14 @@ define void @f() nounwind { ; X86-NEXT: addl $1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: andl $3, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movb $65, %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movsbl %al, %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -115,24 +113,23 @@ define void @f() nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esp,%esi), %edi -; X86-NEXT: movl 116(%esp,%esi), %eax -; X86-NEXT: movl 120(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl 132(%esp,%eax), %edi +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $3, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: je .LBB0_13 ; X86-NEXT: # %bb.11: # %udiv-preheader -; X86-NEXT: andl $3, %edi -; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -146,94 +143,97 @@ define void @f() nounwind { ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 72(%esp,%eax), %ebx -; X86-NEXT: movl 64(%esp,%eax), %esi -; X86-NEXT: movl 68(%esp,%eax), %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrdl %cl, %ebx, %eax -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl 84(%esp,%eax), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %eax ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shrdl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl $-1, %eax +; X86-NEXT: shrdl %cl, %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax +; X86-NEXT: andl $3, %eax +; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $3, %eax +; X86-NEXT: andl $3, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $3, %edi -; X86-NEXT: andl $3, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_12: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl $1, %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %edx -; X86-NEXT: andl $2, %edx -; X86-NEXT: shrl %edx -; 
X86-NEXT: leal (%edx,%ebx,2), %ebx +; X86-NEXT: shldl $1, %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %esi +; X86-NEXT: andl $2, %esi +; X86-NEXT: shrl %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %eax, %eax +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: leal (%esi,%edx,2), %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl $3, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: shll $30, %edx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: shll $30, %esi +; X86-NEXT: movl %esi, %edi ; X86-NEXT: sarl $30, %edi -; X86-NEXT: sarl $31, %edx -; X86-NEXT: shrdl $1, %edx, %edi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: shrdl $1, %esi, %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: subl %edi, %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: subl %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl %esi, %ebx ; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: andl $3, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $3, %edi ; X86-NEXT: andl $3, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %eax ; X86-NEXT: jne .LBB0_12 ; X86-NEXT: .LBB0_13: # %udiv-loop-exit -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-NEXT: .LBB0_14: # %udiv-end ; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload ; X86-NEXT: setne (%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movb $0, (%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll index 205849e7d05db..366acc7b1c47c 100644 --- a/llvm/test/CodeGen/X86/pr38738.ll +++ b/llvm/test/CodeGen/X86/pr38738.ll @@ -84,14 +84,11 @@ entry: define void @tryset(ptr nocapture %x) { ; X64SSE-LABEL: tryset: ; X64SSE: # %bb.0: -; X64SSE-NEXT: movq $0, 56(%rdi) -; X64SSE-NEXT: movq $0, 48(%rdi) -; X64SSE-NEXT: movq $0, 40(%rdi) -; X64SSE-NEXT: movq $0, 32(%rdi) -; X64SSE-NEXT: movq $0, 24(%rdi) -; X64SSE-NEXT: movq $0, 16(%rdi) -; X64SSE-NEXT: movq $0, 8(%rdi) -; X64SSE-NEXT: movq $0, (%rdi) +; X64SSE-NEXT: xorps %xmm0, %xmm0 +; X64SSE-NEXT: movups %xmm0, 48(%rdi) +; X64SSE-NEXT: movups %xmm0, 32(%rdi) +; X64SSE-NEXT: movups %xmm0, 16(%rdi) +; X64SSE-NEXT: movups %xmm0, (%rdi) ; X64SSE-NEXT: retq ; ; X86SSE-LABEL: tryset: @@ -117,28 +114,21 @@ define void @tryset(ptr nocapture %x) { ; ; X64SSE2-LABEL: tryset: ; X64SSE2: # %bb.0: -; X64SSE2-NEXT: movq $0, 56(%rdi) -; X64SSE2-NEXT: movq $0, 48(%rdi) -; X64SSE2-NEXT: movq $0, 40(%rdi) -; X64SSE2-NEXT: movq $0, 32(%rdi) -; X64SSE2-NEXT: movq $0, 24(%rdi) -; X64SSE2-NEXT: movq $0, 16(%rdi) -; X64SSE2-NEXT: movq $0, 8(%rdi) -; X64SSE2-NEXT: movq $0, (%rdi) +; X64SSE2-NEXT: xorps %xmm0, %xmm0 +; X64SSE2-NEXT: movups %xmm0, 48(%rdi) +; X64SSE2-NEXT: movups %xmm0, 32(%rdi) +; X64SSE2-NEXT: movups %xmm0, 16(%rdi) +; X64SSE2-NEXT: movups %xmm0, (%rdi) ; X64SSE2-NEXT: retq ; ; X86SSE2-LABEL: tryset: ; X86SSE2: # %bb.0: ; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86SSE2-NEXT: xorps %xmm0, %xmm0 -; X86SSE2-NEXT: movsd %xmm0, 56(%eax) -; X86SSE2-NEXT: movsd %xmm0, 48(%eax) -; X86SSE2-NEXT: movsd %xmm0, 40(%eax) -; X86SSE2-NEXT: movsd %xmm0, 32(%eax) -; X86SSE2-NEXT: movsd %xmm0, 24(%eax) -; X86SSE2-NEXT: movsd %xmm0, 16(%eax) -; X86SSE2-NEXT: movsd %xmm0, 8(%eax) -; X86SSE2-NEXT: movsd %xmm0, (%eax) +; X86SSE2-NEXT: movups %xmm0, 48(%eax) +; X86SSE2-NEXT: movups %xmm0, 32(%eax) +; X86SSE2-NEXT: movups %xmm0, 16(%eax) +; X86SSE2-NEXT: movups %xmm0, (%eax) ; X86SSE2-NEXT: retl ; ; X64AVX-LABEL: tryset: @@ -164,14 +154,10 @@ define void @tryset(ptr nocapture %x) { define void @trycpy(ptr nocapture %x, ptr nocapture readonly %y) { ; X64SSE-LABEL: trycpy: ; X64SSE: # %bb.0: -; X64SSE-NEXT: movq 24(%rsi), %rax -; X64SSE-NEXT: movq %rax, 24(%rdi) -; X64SSE-NEXT: movq 16(%rsi), %rax -; X64SSE-NEXT: movq %rax, 16(%rdi) -; X64SSE-NEXT: movq (%rsi), %rax -; X64SSE-NEXT: movq 8(%rsi), %rcx -; X64SSE-NEXT: movq %rcx, 8(%rdi) -; X64SSE-NEXT: movq %rax, (%rdi) +; X64SSE-NEXT: movups (%rsi), %xmm0 +; X64SSE-NEXT: movups 16(%rsi), %xmm1 +; X64SSE-NEXT: movups %xmm1, 16(%rdi) +; X64SSE-NEXT: movups %xmm0, (%rdi) ; X64SSE-NEXT: retq ; ; X86SSE-LABEL: trycpy: @@ -198,28 +184,20 @@ define void @trycpy(ptr nocapture %x, ptr nocapture readonly %y) { ; ; X64SSE2-LABEL: trycpy: ; X64SSE2: # %bb.0: -; X64SSE2-NEXT: movq 24(%rsi), %rax -; X64SSE2-NEXT: movq %rax, 24(%rdi) -; X64SSE2-NEXT: movq 16(%rsi), %rax -; X64SSE2-NEXT: movq %rax, 16(%rdi) -; X64SSE2-NEXT: movq (%rsi), %rax -; X64SSE2-NEXT: movq 8(%rsi), %rcx -; 
X64SSE2-NEXT: movq %rcx, 8(%rdi) -; X64SSE2-NEXT: movq %rax, (%rdi) +; X64SSE2-NEXT: movups (%rsi), %xmm0 +; X64SSE2-NEXT: movups 16(%rsi), %xmm1 +; X64SSE2-NEXT: movups %xmm1, 16(%rdi) +; X64SSE2-NEXT: movups %xmm0, (%rdi) ; X64SSE2-NEXT: retq ; ; X86SSE2-LABEL: trycpy: ; X86SSE2: # %bb.0: ; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86SSE2-NEXT: movsd %xmm0, 24(%eax) -; X86SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86SSE2-NEXT: movsd %xmm0, 16(%eax) -; X86SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86SSE2-NEXT: movsd %xmm1, 8(%eax) -; X86SSE2-NEXT: movsd %xmm0, (%eax) +; X86SSE2-NEXT: movups (%ecx), %xmm0 +; X86SSE2-NEXT: movups 16(%ecx), %xmm1 +; X86SSE2-NEXT: movups %xmm1, 16(%eax) +; X86SSE2-NEXT: movups %xmm0, (%eax) ; X86SSE2-NEXT: retl ; ; X64AVX-LABEL: trycpy: diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll index c3c96e8228797..364f4c4347ca3 100644 --- a/llvm/test/CodeGen/X86/pr38795.ll +++ b/llvm/test/CodeGen/X86/pr38795.ll @@ -68,10 +68,10 @@ define dso_local void @fn() { ; CHECK-NEXT: calll printf ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload -; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # implicit-def: $eax ; CHECK-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne .LBB0_15 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: # %bb.5: # %for.cond35 @@ -288,9 +288,9 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) { ; CHECK-NEXT: # %bb.9: # %if.then13 ; CHECK-NEXT: # in Loop: Header=BB1_1 Depth=1 ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: testb $1, %al ; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: movl $0, %ebx +; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jne .LBB1_8 ; CHECK-NEXT: jmp .LBB1_5 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/pr38865-3.ll b/llvm/test/CodeGen/X86/pr38865-3.ll index bc73a2585c996..084fbec81f942 100644 --- a/llvm/test/CodeGen/X86/pr38865-3.ll +++ b/llvm/test/CodeGen/X86/pr38865-3.ll @@ -7,11 +7,17 @@ target triple = "x86_64-unknown-linux-gnux32" define void @foo(ptr %x) optsize { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: movl $707406378, %eax # encoding: [0xb8,0x2a,0x2a,0x2a,0x2a] -; CHECK-NEXT: # imm = 0x2A2A2A2A -; CHECK-NEXT: movl $32, %ecx # encoding: [0xb9,0x20,0x00,0x00,0x00] -; CHECK-NEXT: # kill: def $edi killed $edi killed $rdi -; CHECK-NEXT: rep;stosl %eax, %es:(%edi) # encoding: [0xf3,0x67,0xab] +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; CHECK-NEXT: # encoding: [0x0f,0x28,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: movups %xmm0, 112(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x70] +; CHECK-NEXT: movups %xmm0, 96(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x60] +; CHECK-NEXT: movups %xmm0, 80(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x50] +; CHECK-NEXT: movups %xmm0, 64(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x40] +; CHECK-NEXT: movups %xmm0, 48(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x30] +; CHECK-NEXT: movups %xmm0, 32(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x20] +; CHECK-NEXT: movups %xmm0, 16(%edi) # encoding: [0x67,0x0f,0x11,0x47,0x10] +; CHECK-NEXT: movups %xmm0, (%edi) # encoding: [0x67,0x0f,0x11,0x07] ; CHECK-NEXT: retq # encoding: 
[0xc3] call void @llvm.memset.p0.i32(ptr align 4 %x, i8 42, i32 128, i1 false) ret void diff --git a/llvm/test/CodeGen/X86/pr40891.ll b/llvm/test/CodeGen/X86/pr40891.ll index 1795333ca3f79..3317a5076d781 100644 --- a/llvm/test/CodeGen/X86/pr40891.ll +++ b/llvm/test/CodeGen/X86/pr40891.ll @@ -7,8 +7,8 @@ define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) { ; X86-LABEL: foo: ; X86: # %bb.0: -; X86-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 +; X86-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll index 2fb4410567be6..20a902e7f5ed8 100644 --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -15,14 +15,14 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; CHECK-NEXT: bswapq %r12 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; CHECK-NEXT: movq %r12, %r10 ; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %r11, %r10 +; CHECK-NEXT: andq %r11, %r12 ; CHECK-NEXT: shlq $4, %r12 ; CHECK-NEXT: orq %r10, %r12 ; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 @@ -41,8 +41,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r14 ; CHECK-NEXT: movq %r14, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rsi, %r12 -; CHECK-NEXT: andq %rsi, %r14 +; CHECK-NEXT: andq %r11, %r12 +; CHECK-NEXT: andq %r11, %r14 ; CHECK-NEXT: shlq $4, %r14 ; CHECK-NEXT: orq %r12, %r14 ; CHECK-NEXT: movq %r14, %r12 @@ -55,13 +55,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %r13 ; CHECK-NEXT: shrq %r12 ; CHECK-NEXT: andq %r14, %r12 +; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: leaq (%r12,%r13,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: movq %r15, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rsi, %r12 -; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: andq %r11, %r12 +; CHECK-NEXT: andq %r11, %r15 ; CHECK-NEXT: shlq $4, %r15 ; CHECK-NEXT: orq %r12, %r15 ; CHECK-NEXT: movq %r15, %r12 @@ -78,8 +78,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: movq %rbx, %r15 ; CHECK-NEXT: shrq $4, %r15 -; CHECK-NEXT: andq %rsi, %r15 -; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %r11, %r15 +; CHECK-NEXT: andq %r11, %rbx ; CHECK-NEXT: shlq $4, %rbx ; CHECK-NEXT: orq %r15, %rbx ; CHECK-NEXT: movq %rbx, %r15 @@ -91,13 +91,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %r15 ; CHECK-NEXT: shrq %rbx ; CHECK-NEXT: andq %r14, %rbx +; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: leaq (%rbx,%r15,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rbx +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -115,8 +115,8 
@@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rbx +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -134,8 +134,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rbx +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -153,8 +153,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rbx +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -166,21 +166,21 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rbx ; CHECK-NEXT: shrq %rdi ; CHECK-NEXT: andq %r14, %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: leaq (%rdi,%rbx,2), %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: bswapq %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi -; CHECK-NEXT: shlq $4, %rdi -; CHECK-NEXT: orq %rbx, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: andq %r10, %rbx -; CHECK-NEXT: shrq $2, %rdi +; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: shrq $4, %rdi +; CHECK-NEXT: andq %r11, %rdi +; CHECK-NEXT: andq %r11, %r15 +; CHECK-NEXT: shlq $4, %r15 +; CHECK-NEXT: orq %rdi, %r15 +; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: andq %r10, %rdi -; CHECK-NEXT: leaq (%rdi,%rbx,4), %rdi +; CHECK-NEXT: shrq $2, %r15 +; CHECK-NEXT: andq %r10, %r15 +; CHECK-NEXT: leaq (%r15,%rdi,4), %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: andq %r14, %rbx ; CHECK-NEXT: shrq %rdi @@ -191,8 +191,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rsi, %rbx -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rbx +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -210,8 +210,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rdi +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %r11, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rax, %rdi ; CHECK-NEXT: movq %rdi, %rax @@ -223,12 +223,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rdi ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdi,2), %rdi ; CHECK-NEXT: bswapq %r9 +; CHECK-NEXT: leaq (%rax,%rdi,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r9 +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %r11, %r9 ; CHECK-NEXT: shlq $4, %r9 ; CHECK-NEXT: orq %rax, %r9 ; CHECK-NEXT: movq %r9, %rax @@ -245,8 +246,8 @@ define i1000 @square(i1000 %A) nounwind { ; 
CHECK-NEXT: bswapq %r8 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %r8 +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %r11, %r8 ; CHECK-NEXT: shlq $4, %r8 ; CHECK-NEXT: orq %rax, %r8 ; CHECK-NEXT: movq %r8, %rax @@ -258,13 +259,12 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %r8 ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r8,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rcx +; CHECK-NEXT: leaq (%rax,%r8,2), %rbp ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %r11, %rcx ; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, %rax @@ -280,49 +280,49 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rdx +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: andq %r11, %rdx ; CHECK-NEXT: shlq $4, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: andq %r10, %rax ; CHECK-NEXT: shrq $2, %rdx ; CHECK-NEXT: andq %r10, %rdx -; CHECK-NEXT: leaq (%rdx,%rax,4), %rax -; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: leaq (%rdx,%rax,4), %rdx +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: andq %r14, %rcx +; CHECK-NEXT: shrq %rdx ; CHECK-NEXT: andq %r14, %rdx -; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: bswapq %rcx -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rsi, %rax -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: andq %r10, %rax -; CHECK-NEXT: shrq $2, %rcx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: bswapq %rax +; CHECK-NEXT: leaq (%rdx,%rcx,2), %rsi +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrq $4, %rcx +; CHECK-NEXT: andq %r11, %rcx +; CHECK-NEXT: andq %r11, %rax +; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: andq %r10, %rcx -; CHECK-NEXT: leaq (%rcx,%rax,4), %rax -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: andq %r14, %rsi -; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: andq %r10, %rax +; CHECK-NEXT: leaq (%rax,%rcx,4), %r10 +; CHECK-NEXT: movq %r10, %rdx +; CHECK-NEXT: andq %r14, %rdx +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: andq %r14, %r10 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %r10 +; CHECK-NEXT: shrdq $24, %rax, %rdi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rbp, %rcx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rax, %rcx ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r13, %rbp +; CHECK-NEXT: shrdq $24, %r13, %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r12, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload @@ -333,20 +333,21 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: shrdq $24, %r11, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r9, %r11 -; CHECK-NEXT: movq %rdi, %r8 -; CHECK-NEXT: shrdq $24, %rdi, %r9 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rdi, %r8 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rcx, %rdi -; CHECK-NEXT: shrdq $24, %rbx, %rcx -; CHECK-NEXT: shrdq $24, %rdx, %rbx -; CHECK-NEXT: shrdq $24, %rsi, %rdx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r8, %r9 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movq %rdx, 112(%rax) +; CHECK-NEXT: shrdq $24, %rax, %r8 +; CHECK-NEXT: shrdq $24, %rbp, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: shrdq $24, %rbx, %rbp +; CHECK-NEXT: shrdq $24, %rsi, %rbx +; CHECK-NEXT: leaq (%r10,%rdx,2), %r10 +; CHECK-NEXT: shrdq $24, %r10, %rsi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: movq %rsi, 112(%rax) ; CHECK-NEXT: movq %rbx, 104(%rax) -; CHECK-NEXT: movq %rcx, 96(%rax) -; CHECK-NEXT: movq %rdi, 88(%rax) +; CHECK-NEXT: movq %rbp, 96(%rax) +; CHECK-NEXT: movq %rcx, 88(%rax) ; CHECK-NEXT: movq %r8, 80(%rax) ; CHECK-NEXT: movq %r9, 72(%rax) ; CHECK-NEXT: movq %r11, 64(%rax) @@ -354,15 +355,16 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: movq %r15, 48(%rax) ; CHECK-NEXT: movq %r12, 40(%rax) ; CHECK-NEXT: movq %r13, 32(%rax) -; CHECK-NEXT: movq %rbp, 24(%rax) +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movq %rcx, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r10, (%rax) -; CHECK-NEXT: movq %rsi, %rcx -; CHECK-NEXT: shrq $56, %rsi -; CHECK-NEXT: movb %sil, 124(%rax) +; CHECK-NEXT: movq %rdi, (%rax) +; CHECK-NEXT: movq %r10, %rcx +; CHECK-NEXT: shrq $56, %r10 +; CHECK-NEXT: movb %r10b, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/pr44812.ll b/llvm/test/CodeGen/X86/pr44812.ll index 7c4dc67dc9c0e..ebb1d6ed20558 100644 --- a/llvm/test/CodeGen/X86/pr44812.ll +++ b/llvm/test/CodeGen/X86/pr44812.ll @@ -8,9 +8,9 @@ define <2 x i32> @foo(<2 x i32> %tmp) { ; CHECK-NEXT: leal 7(%eax), %ecx ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: cmovnsl %eax, %ecx -; CHECK-NEXT: sarl $3, %ecx ; CHECK-NEXT: movl $1717986919, %eax # imm = 0x66666667 ; CHECK-NEXT: imull {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $3, %ecx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $2, %edx diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll index 7c8d5e099ca67..660e1dfa3c445 100644 --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -33,11 +33,11 @@ define <3 x i32> @f_29(<12 x i16> %a, <12 x i16> %b) { ; CHECK-NEXT: movd %edi, %xmm1 ; CHECK-NEXT: punpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: pinsrw $1, {{[0-9]+}}(%rsp), %xmm4 ; CHECK-NEXT: pinsrw $2, {{[0-9]+}}(%rsp), %xmm4 ; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%rsp), %xmm4 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: pinsrw $1, {{[0-9]+}}(%rsp), %xmm2 ; CHECK-NEXT: pinsrw $2, {{[0-9]+}}(%rsp), %xmm2 diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll index 72877e1b1d67d..d62ba66b9afbd 100644 --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -18,8 +18,6 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -29,21 +27,23 @@ define <9 x float> @mload_split9(<9 x i1> %mask, ptr %addr, <9 x float> %dst) { ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4 -; CHECK-NEXT: vblendvps %ymm1, %ymm4, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm2, %ymm3 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -60,8 +60,6 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst ; CHECK-NEXT: 
vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -74,16 +72,18 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst ; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm5 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 ; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 @@ -92,15 +92,15 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst ; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm5, %ymm5 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm5, %xmm3, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 48(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -111,14 +111,11 @@ define <13 x float> @mload_split13(<13 x i1> %mask, ptr %addr, <13 x float> %dst define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst) { ; CHECK-LABEL: mload_split14: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: 
vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -133,12 +130,13 @@ define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst ; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 @@ -147,7 +145,9 @@ define <14 x float> @mload_split14(<14 x i1> %mask, ptr %addr, <14 x float> %dst ; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 ; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] ; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm2[u],zero,xmm2[u] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 @@ -177,57 +177,57 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: vmovd %esi, %xmm3 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 
-; CHECK-NEXT: vpinsrb $3, %r8d, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4 -; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2 -; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm3, %ymm4 -; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %r10d, %xmm3 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vblendvps %ymm1, %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm3, %ymm4 -; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, 64(%rax) -; 
CHECK-NEXT: vmovaps %ymm1, 32(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm2, %ymm3 +; CHECK-NEXT: vmovd %edx, %xmm5 +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vmaskmovps 64(%rcx), %ymm5, %ymm6 +; CHECK-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vblendvps %xmm5, %xmm6, %xmm4, %xmm2 +; CHECK-NEXT: vmovss %xmm2, 64(%rdi) +; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vmovaps %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <17 x float> @llvm.masked.load.v17f32.p0(ptr %addr, i32 4, <17 x i1>%mask, <17 x float> %dst) @@ -237,85 +237,85 @@ define <17 x float> @mload_split17(<17 x i1> %mask, ptr %addr, <17 x float> %dst define <23 x float> @mload_split23(<23 x i1> %mask, ptr %addr, <23 x float> %dst) { ; CHECK-LABEL: mload_split23: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0],mem[0],xmm8[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],mem[0],xmm8[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],mem[0] ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm9[0],mem[0],xmm9[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm9[0,1],mem[0],xmm9[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm9[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm9[0],mem[0],xmm9[2,3] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: vmovd {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; CHECK-NEXT: vpslld $31, %xmm11, %xmm11 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm10, %xmm10 +; CHECK-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm10, %ymm11 +; CHECK-NEXT: vinsertps {{.*#+}} xmm12 = xmm9[0,1],mem[0],xmm9[3] +; CHECK-NEXT: vblendvps %ymm10, %ymm11, %ymm8, %ymm8 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: vmovd %r10d, %xmm9 +; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm9, %xmm9 +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm9, %xmm9 +; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm9, %xmm10 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm9, %xmm9 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm10, %xmm10 +; CHECK-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm10, 
%xmm11 +; CHECK-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm10 +; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm10, %ymm10 +; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm13 +; CHECK-NEXT: vblendvps %xmm11, %xmm13, %xmm12, %xmm11 ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: vmovd %esi, %xmm4 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm4, %xmm4 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; CHECK-NEXT: vmaskmovps (%rdi), %ymm4, %ymm5 -; CHECK-NEXT: vblendvps %ymm4, %ymm5, %ymm3, %ymm3 -; CHECK-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm4, %ymm5 -; CHECK-NEXT: 
vblendvps %ymm4, %ymm5, %ymm2, %ymm2 -; CHECK-NEXT: vmovd %r10d, %xmm4 -; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 -; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm6, %ymm6 -; CHECK-NEXT: vmovaps %ymm2, 32(%rax) -; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm2 -; CHECK-NEXT: vblendvps %xmm4, %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vextractps $2, %xmm1, 88(%rax) -; CHECK-NEXT: vmovlps %xmm1, 80(%rax) -; CHECK-NEXT: vblendvps %xmm5, %xmm6, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 64(%rax) -; CHECK-NEXT: vmovaps %ymm3, (%rax) +; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vmaskmovps (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vmovaps %ymm8, 32(%rax) +; CHECK-NEXT: vextractps $2, %xmm11, 88(%rax) +; CHECK-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vblendvps %xmm9, %xmm10, %xmm1, %xmm1 +; CHECK-NEXT: vmovlps %xmm11, 80(%rax) +; CHECK-NEXT: vmovaps %xmm1, 64(%rax) +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <23 x float> @llvm.masked.load.v23f32.p0(ptr %addr, i32 4, <23 x i1>%mask, <23 x float> %dst) diff --git a/llvm/test/CodeGen/X86/pr45563.ll b/llvm/test/CodeGen/X86/pr45563.ll index 214ae56b50c01..d854653c6ab90 100644 --- a/llvm/test/CodeGen/X86/pr45563.ll +++ b/llvm/test/CodeGen/X86/pr45563.ll @@ -26,33 +26,33 @@ define <16 x double> @bug45563(ptr %addr, <16 x double> %dst, <16 x i64> %e, <16 ; CHECK-NEXT: vmovdqa 128(%rbp), %xmm10 ; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 ; CHECK-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm7 -; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm8 -; CHECK-NEXT: vmovdqa 80(%rbp), %xmm9 -; CHECK-NEXT: vmovdqa 96(%rbp), %xmm10 -; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 -; CHECK-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm6 -; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm8 -; CHECK-NEXT: vmovdqa 48(%rbp), %xmm9 +; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm9 +; CHECK-NEXT: 
vextractf128 $1, %ymm4, %xmm10 +; CHECK-NEXT: vmovdqa 32(%rbp), %xmm11 +; CHECK-NEXT: vpcmpgtq %xmm10, %xmm11, %xmm10 +; CHECK-NEXT: vmovdqa 16(%rbp), %xmm11 +; CHECK-NEXT: vpcmpgtq %xmm4, %xmm11, %xmm4 +; CHECK-NEXT: vmovdqa 96(%rbp), %xmm11 +; CHECK-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 +; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm10 +; CHECK-NEXT: vpcmpgtq %xmm9, %xmm11, %xmm9 +; CHECK-NEXT: vblendvpd %ymm4, %ymm10, %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm4 ; CHECK-NEXT: vmovdqa 64(%rbp), %xmm10 -; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 -; CHECK-NEXT: vpcmpgtq %xmm5, %xmm9, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm8 -; CHECK-NEXT: vmovdqa 16(%rbp), %xmm9 -; CHECK-NEXT: vmovdqa 32(%rbp), %xmm10 -; CHECK-NEXT: vpcmpgtq %xmm8, %xmm10, %xmm8 -; CHECK-NEXT: vpcmpgtq %xmm4, %xmm9, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; CHECK-NEXT: vmaskmovpd (%rdi), %ymm4, %ymm8 -; CHECK-NEXT: vblendvpd %ymm4, %ymm8, %ymm0, %ymm0 -; CHECK-NEXT: vmaskmovpd 32(%rdi), %ymm5, %ymm4 -; CHECK-NEXT: vblendvpd %ymm5, %ymm4, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm4 -; CHECK-NEXT: vblendvpd %ymm6, %ymm4, %ymm2, %ymm2 -; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm7, %ymm4 -; CHECK-NEXT: vblendvpd %ymm7, %ymm4, %ymm3, %ymm3 +; CHECK-NEXT: vpcmpgtq %xmm4, %xmm10, %xmm4 +; CHECK-NEXT: vmovdqa 48(%rbp), %xmm10 +; CHECK-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vmaskmovpd 32(%rdi), %ymm4, %ymm5 +; CHECK-NEXT: vmovdqa 80(%rbp), %xmm10 +; CHECK-NEXT: vpcmpgtq %xmm6, %xmm10, %xmm6 +; CHECK-NEXT: vblendvpd %ymm4, %ymm5, %ymm1, %ymm1 +; CHECK-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm4 +; CHECK-NEXT: vmaskmovpd 64(%rdi), %ymm4, %ymm5 +; CHECK-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm6 +; CHECK-NEXT: vmaskmovpd 96(%rdi), %ymm6, %ymm7 +; CHECK-NEXT: vblendvpd %ymm4, %ymm5, %ymm2, %ymm2 +; CHECK-NEXT: vblendvpd %ymm6, %ymm7, %ymm3, %ymm3 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll index 04c342b6673ed..e4a70fece8fe5 100644 --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -17,24 +17,24 @@ define void @mstore_split9(<9 x float> %value, ptr %addr, <9 x i1> %mask) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] +; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 +; CHECK-NEXT: vmaskmovps %ymm5, %ymm6, 32(%rdi) ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb 
$2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -52,12 +52,10 @@ define void @mstore_split13(<13 x float> %value, ptr %addr, <13 x i1> %mask) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -65,25 +63,27 @@ define void @mstore_split13(<13 x float> %value, ptr %addr, <13 x i1> %mask) { ; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm5 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; 
CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm5, %xmm6 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm3, 32(%rdi) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm6[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm7, %xmm7 +; CHECK-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm5, 32(%rdi) +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -110,35 +110,35 @@ define void @mstore_split14(<14 x float> %value, ptr %addr, <14 x i1> %mask) { ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vmovd %esi, %xmm3 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm3, %xmm4 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $9, 
{{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm3, (%rdi) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm4[u],zero,xmm4[u] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v14f32.p0(<14 x float> %value, ptr %addr, i32 4, <14 x i1>%mask) @@ -148,41 +148,41 @@ define void @mstore_split14(<14 x float> %value, ptr %addr, <14 x i1> %mask) { define void @mstore_split17(<17 x float> %value, ptr %addr, <17 x i1> %mask) { ; CHECK-LABEL: mstore_split17: ; CHECK: # %bb.0: +; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: vmovd %eax, %xmm9 +; CHECK-NEXT: vpslld $31, %xmm9, %xmm9 +; CHECK-NEXT: vmaskmovps %ymm8, %ymm9, 64(%rdi) ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],mem[0],xmm6[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],mem[0],xmm6[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0,1,2],mem[0] +; CHECK-NEXT: vmovd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm6, %xmm5 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, 
%xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm5, 32(%rdi) ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 64(%rdi) -; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -208,58 +208,58 @@ define void @mstore_split23(<23 x float> %value, ptr %addr, <23 x i1> %mask) { ; CHECK-LABEL: mstore_split23: ; CHECK: # %bb.0: ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],mem[0],xmm6[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],mem[0],xmm6[3] +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0],mem[0],xmm7[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],mem[0],xmm7[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vmovss 
{{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm1[0,1],mem[0],xmm1[3] +; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm8[0,1,2],mem[0] +; CHECK-NEXT: vmovd {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm6, %xmm6 +; CHECK-NEXT: vpslld $31, %xmm7, %xmm7 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 +; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; CHECK-NEXT: vmaskmovps %ymm5, %ymm6, 32(%rdi) ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 32(%rdi) -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 64(%rdi) +; CHECK-NEXT: 
vmovd %eax, %xmm5 +; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm5, 64(%rdi) +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr47857.ll b/llvm/test/CodeGen/X86/pr47857.ll index 419e839a5d974..1fa59b5533c8b 100644 --- a/llvm/test/CodeGen/X86/pr47857.ll +++ b/llvm/test/CodeGen/X86/pr47857.ll @@ -6,32 +6,32 @@ define void @PR47857(ptr noalias nocapture writeonly sret(%"struct.std::array") align 8 %0, ptr nocapture noundef nonnull readonly align 8 dereferenceable(32) %1, ptr nocapture noundef nonnull readonly align 8 dereferenceable(32) %2) { ; CHECK-LABEL: PR47857: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rdx), %r9 -; CHECK-NEXT: movq 8(%rdx), %rcx -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: addq (%rsi), %r9 -; CHECK-NEXT: adcq 8(%rsi), %rcx -; CHECK-NEXT: movq 16(%rdx), %r8 -; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: movq (%rdx), %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: addq (%rsi), %rax +; CHECK-NEXT: movq 8(%rdx), %r8 +; CHECK-NEXT: adcq 8(%rsi), %r8 +; CHECK-NEXT: movq 16(%rdx), %r9 +; CHECK-NEXT: adcq 16(%rsi), %r9 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx -; CHECK-NEXT: sbbq %rdi, %rdi -; CHECK-NEXT: andl $38, %edi -; CHECK-NEXT: addq %rdi, %r9 -; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: sbbq %rcx, %rcx +; CHECK-NEXT: andl $38, %ecx +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: adcq $0, %r9 ; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: sbbq %rdi, %rdi -; CHECK-NEXT: andl $38, %edi -; CHECK-NEXT: addq %r9, %rdi -; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: sbbq %rcx, %rcx +; CHECK-NEXT: andl $38, %ecx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: adcq $0, %r9 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: adcq $0, %rdx -; CHECK-NEXT: movq %rdi, (%rax) -; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rdx, 24(%rax) +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: movq %r8, 8(%rdi) +; CHECK-NEXT: movq %r9, 16(%rdi) +; CHECK-NEXT: movq %rdx, 24(%rdi) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 %5 = getelementptr inbounds %"struct.std::array", ptr %1, i64 0, i32 0, i64 1 diff --git a/llvm/test/CodeGen/X86/pr47874.ll b/llvm/test/CodeGen/X86/pr47874.ll index ce3aaca59fae8..182099aaeda94 100644 --- a/llvm/test/CodeGen/X86/pr47874.ll +++ b/llvm/test/CodeGen/X86/pr47874.ll @@ -11,6 +11,7 @@ define void @a(ptr %arg, i32 %arg1) { ; SSE2-NEXT: ## %bb.1: ## %bb2 ; SSE2-NEXT: movl 
%esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE2-NEXT: movl %esi, %eax +; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: LBB0_2: ## %bb6 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -18,9 +19,9 @@ define void @a(ptr %arg, i32 %arg1) { ; SSE2-NEXT: ## InlineAsm End ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload -; SSE2-NEXT: movss %xmm0, (%rdi) -; SSE2-NEXT: addq $4, %rdi -; SSE2-NEXT: decq %rax +; SSE2-NEXT: movss %xmm0, (%rdi,%rcx,4) +; SSE2-NEXT: incq %rcx +; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne LBB0_2 ; SSE2-NEXT: LBB0_3: ## %bb5 ; SSE2-NEXT: retq @@ -32,6 +33,7 @@ define void @a(ptr %arg, i32 %arg1) { ; AVX-NEXT: ## %bb.1: ## %bb2 ; AVX-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 ; AVX-NEXT: LBB0_2: ## %bb6 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -39,10 +41,10 @@ define void @a(ptr %arg, i32 %arg1) { ; AVX-NEXT: ## InlineAsm End ; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload ; AVX-NEXT: ## xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: addq $4, %rdi -; AVX-NEXT: decq %rax +; AVX-NEXT: vaddss (%rdi,%rcx,4), %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%rdi,%rcx,4) +; AVX-NEXT: incq %rcx +; AVX-NEXT: cmpq %rcx, %rax ; AVX-NEXT: jne LBB0_2 ; AVX-NEXT: LBB0_3: ## %bb5 ; AVX-NEXT: retq @@ -77,6 +79,7 @@ define void @b(ptr %arg, i64 %arg1) { ; SSE2-NEXT: jle LBB1_3 ; SSE2-NEXT: ## %bb.1: ## %bb2 ; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: LBB1_2: ## %bb6 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -84,9 +87,9 @@ define void @b(ptr %arg, i64 %arg1) { ; SSE2-NEXT: ## InlineAsm End ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Folded Reload -; SSE2-NEXT: movsd %xmm0, (%rdi) -; SSE2-NEXT: addq $8, %rdi -; SSE2-NEXT: decq %rsi +; SSE2-NEXT: movsd %xmm0, (%rdi,%rax,8) +; SSE2-NEXT: incq %rax +; SSE2-NEXT: cmpq %rax, %rsi ; SSE2-NEXT: jne LBB1_2 ; SSE2-NEXT: LBB1_3: ## %bb5 ; SSE2-NEXT: retq @@ -97,6 +100,7 @@ define void @b(ptr %arg, i64 %arg1) { ; AVX-NEXT: jle LBB1_3 ; AVX-NEXT: ## %bb.1: ## %bb2 ; AVX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: .p2align 4 ; AVX-NEXT: LBB1_2: ## %bb6 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -104,10 +108,10 @@ define void @b(ptr %arg, i64 %arg1) { ; AVX-NEXT: ## InlineAsm End ; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload ; AVX-NEXT: ## xmm0 = mem[0],zero -; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovsd %xmm0, (%rdi) -; AVX-NEXT: addq $8, %rdi -; AVX-NEXT: decq %rsi +; AVX-NEXT: vaddsd (%rdi,%rax,8), %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rdi,%rax,8) +; AVX-NEXT: incq %rax +; AVX-NEXT: cmpq %rax, %rsi ; AVX-NEXT: jne LBB1_2 ; AVX-NEXT: LBB1_3: ## %bb5 ; AVX-NEXT: retq @@ -142,6 +146,8 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) { ; SSE2-NEXT: ## %bb.1: ## %bb4 ; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: shlq $4, %rax +; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: LBB2_2: ## %bb8 ; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -149,9 +155,9 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) { ; SSE2-NEXT: 
## InlineAsm End ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload -; SSE2-NEXT: movss %xmm0, (%rdi) -; SSE2-NEXT: addq $16, %rdi -; SSE2-NEXT: decq %rax +; SSE2-NEXT: movss %xmm0, (%rdi,%rcx) +; SSE2-NEXT: addq $16, %rcx +; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne LBB2_2 ; SSE2-NEXT: LBB2_3: ## %bb7 ; SSE2-NEXT: retq @@ -163,6 +169,8 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) { ; AVX-NEXT: ## %bb.1: ## %bb4 ; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; AVX-NEXT: movl %edx, %eax +; AVX-NEXT: shlq $4, %rax +; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4 ; AVX-NEXT: LBB2_2: ## %bb8 ; AVX-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -170,10 +178,10 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) { ; AVX-NEXT: ## InlineAsm End ; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload ; AVX-NEXT: ## xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: addq $16, %rdi -; AVX-NEXT: decq %rax +; AVX-NEXT: vaddss (%rdi,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%rdi,%rcx) +; AVX-NEXT: addq $16, %rcx +; AVX-NEXT: cmpq %rcx, %rax ; AVX-NEXT: jne LBB2_2 ; AVX-NEXT: LBB2_3: ## %bb7 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll index 8843a0410a9f7..d9ea1ee643040 100644 --- a/llvm/test/CodeGen/X86/pr48215.ll +++ b/llvm/test/CodeGen/X86/pr48215.ll @@ -12,11 +12,11 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX1-NEXT: idivl %esi ; AVX1-NEXT: vmovd %edx, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX1-NEXT: vmovd %eax, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,1,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3] ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovmskps %ymm2, %ecx @@ -51,9 +51,9 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX512-NEXT: cltd ; AVX512-NEXT: idivl %esi ; AVX512-NEXT: vpbroadcastd %eax, %ymm0 -; AVX512-NEXT: vpbroadcastd %edx, %xmm1 ; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 +; AVX512-NEXT: vpbroadcastd %edx, %xmm0 +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; AVX512-NEXT: kmovw %k0, %eax ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: kmovw %k1, %eax diff --git a/llvm/test/CodeGen/X86/pr49393.ll b/llvm/test/CodeGen/X86/pr49393.ll index 512177246b5d9..bed326db79e0c 100644 --- a/llvm/test/CodeGen/X86/pr49393.ll +++ b/llvm/test/CodeGen/X86/pr49393.ll @@ -8,23 +8,23 @@ define void @f() { ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: mulsd %xmm0, %xmm1 -; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm2 +; CHECK-NEXT: subsd %xmm0, %xmm2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: imull %eax, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: cvtsi2sd %eax, %xmm2 -; CHECK-NEXT: mulsd %xmm0, %xmm2 -; CHECK-NEXT: mulsd %xmm0, %xmm2 -; CHECK-NEXT: movapd %xmm2, %xmm3 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: cvtsi2sd %eax, %xmm3 ; CHECK-NEXT: mulsd %xmm1, %xmm3 -; CHECK-NEXT: mulsd 
%xmm0, %xmm2 -; CHECK-NEXT: movapd %xmm1, %xmm4 -; CHECK-NEXT: subsd %xmm3, %xmm4 -; CHECK-NEXT: addsd %xmm2, %xmm4 -; CHECK-NEXT: cvttsd2si %xmm4, %eax +; CHECK-NEXT: movapd %xmm3, %xmm4 +; CHECK-NEXT: mulsd %xmm2, %xmm4 +; CHECK-NEXT: mulsd %xmm0, %xmm3 +; CHECK-NEXT: movapd %xmm2, %xmm5 +; CHECK-NEXT: subsd %xmm4, %xmm5 +; CHECK-NEXT: addsd %xmm3, %xmm5 +; CHECK-NEXT: cvttsd2si %xmm5, %eax ; CHECK-NEXT: jmp .LBB0_1 entry: br label %for.cond diff --git a/llvm/test/CodeGen/X86/pr50782.ll b/llvm/test/CodeGen/X86/pr50782.ll index 591a33446d4e3..eba9b40899530 100644 --- a/llvm/test/CodeGen/X86/pr50782.ll +++ b/llvm/test/CodeGen/X86/pr50782.ll @@ -36,9 +36,9 @@ define void @h(float %i) { ; CHECK-NEXT: fsts _d ; CHECK-NEXT: fld1 ; CHECK-NEXT: fldz -; CHECK-NEXT: testl %ecx, %ecx ; CHECK-NEXT: fld %st(0) ; CHECK-NEXT: fld %st(2) +; CHECK-NEXT: testl %ecx, %ecx ; CHECK-NEXT: je LBB0_2 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: fstp %st(1) diff --git a/llvm/test/CodeGen/X86/pr57402.ll b/llvm/test/CodeGen/X86/pr57402.ll index 338229f51cf6e..96fa77a9b1211 100644 --- a/llvm/test/CodeGen/X86/pr57402.ll +++ b/llvm/test/CodeGen/X86/pr57402.ll @@ -6,7 +6,8 @@ define void @PR57402() { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: notl %eax ; CHECK-NEXT: andl $-2, %eax -; CHECK-NEXT: leal 1(%rax,%rax,2), %ecx +; CHECK-NEXT: leal (%rax,%rax,2), %ecx +; CHECK-NEXT: incl %ecx ; CHECK-NEXT: movswq %cx, %rsi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: movq $-1, %rax diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll index 1949841ea216b..24f150727a184 100644 --- a/llvm/test/CodeGen/X86/pr61964.ll +++ b/llvm/test/CodeGen/X86/pr61964.ll @@ -30,7 +30,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; AVX2-LABEL: splitTransposeDecode_8_avx2: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] ; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 @@ -39,9 +39,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; ; AVX512VL-LABEL: splitTransposeDecode_8_avx2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31] ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm3 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: vmovdqa %ymm3, %ymm1 @@ -67,7 +67,7 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; XOPAVX2-LABEL: splitTransposeDecode_8_avx2: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] ; XOPAVX2-NEXT: vpermd 
%ymm2, %ymm3, %ymm2 ; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll index 19a6962731b6a..0cc04a32b7149 100644 --- a/llvm/test/CodeGen/X86/pr62014.ll +++ b/llvm/test/CodeGen/X86/pr62014.ll @@ -26,7 +26,7 @@ define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2 ; SSE42-NEXT: movapd %xmm0, %xmm2 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm3, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 @@ -38,7 +38,7 @@ define <2 x i64> @select_cast_cond_multiuse_v2i64(<2 x i64> %x, <2 x i64> %y, i2 ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,2] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -91,7 +91,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4 ; SSE42-NEXT: movaps %xmm0, %xmm2 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm1 @@ -103,7 +103,7 @@ define <4 x i32> @select_cast_cond_multiuse_v4i32(<4 x i32> %x, <4 x i32> %y, i4 ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 @@ -158,7 +158,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqw %xmm3, %xmm0 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 @@ -170,7 +170,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8 ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 @@ -302,20 +302,20 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: pcmpeqw %xmm5, %xmm6 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm7 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [1,2,4,8] ; 
SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [1,2,4,8,16,32,64,128] +; SSE42-NEXT: pand %xmm7, %xmm6 ; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm2 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [16,32,64,128] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: pcmpeqw %xmm7, %xmm6 ; SSE42-NEXT: movdqa %xmm6, (%rsi) ; SSE42-NEXT: movaps %xmm2, %xmm0 ; SSE42-NEXT: movaps %xmm3, %xmm1 @@ -325,10 +325,10 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 @@ -351,9 +351,9 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} ; AVX512VL-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: vpmovdw %ymm2, (%rsi) +; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1} {z} +; AVX512VL-NEXT: vpmovdw %ymm1, (%rsi) ; AVX512VL-NEXT: retq %z = bitcast i8 %m to <8 x i1> %s = sext <8 x i1> %z to <8 x i16> diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index b5b80515fc6d9..c70f690ef5255 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -20,8 +20,8 @@ define i32 @PR63108() { ; SSE-NEXT: .LBB0_3: # %vector.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: testb %al, %al ; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: testb %al, %al ; SSE-NEXT: jne .LBB0_3 ; SSE-NEXT: # %bb.4: # %middle.block ; SSE-NEXT: pxor %xmm2, %xmm0 @@ -81,7 +81,7 @@ define i32 @PR63108() { ; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: je .LBB0_2 ; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX2-NEXT: jmp .LBB0_5 ; AVX2-NEXT: .LBB0_2: # %vector.body.preheader ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] diff --git a/llvm/test/CodeGen/X86/pr63507.ll b/llvm/test/CodeGen/X86/pr63507.ll index 46f1038db19c6..869cd418005e7 100644 --- a/llvm/test/CodeGen/X86/pr63507.ll +++ b/llvm/test/CodeGen/X86/pr63507.ll @@ -4,7 +4,7 @@ define <4 x i32> @PR63507() { ; CHECK-LABEL: PR63507: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,0,4294967295,0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295] ; CHECK-NEXT: vpmulld %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %psll.i = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/pr65895.ll b/llvm/test/CodeGen/X86/pr65895.ll index 0990b10fa936d..cd0645744bf5b 100644 --- a/llvm/test/CodeGen/X86/pr65895.ll +++ b/llvm/test/CodeGen/X86/pr65895.ll @@ -24,7 +24,8 @@ define i32 
@PR65895() { ; CHECK-NEXT: addb $-3, %al ; CHECK-NEXT: movsbl %al, %eax ; CHECK-NEXT: movl %eax, d(%rip) -; CHECK-NEXT: leal 247(%rax,%rax,2), %eax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: addl $247, %eax ; CHECK-NEXT: movb $1, c(%rip) ; CHECK-NEXT: movsbq %al, %rax ; CHECK-NEXT: movq %rax, e(%rip) @@ -66,7 +67,8 @@ define void @foo(i8 %arg) nounwind { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: addb $-109, %dil ; CHECK-NEXT: movsbl %dil, %eax -; CHECK-NEXT: leal 1(%rax,%rax,2), %edi +; CHECK-NEXT: leal (%rax,%rax,2), %edi +; CHECK-NEXT: incl %edi ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index ceccee00c9457..1c3b4bd4971c1 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -39,7 +39,7 @@ define void @main(<16 x i32> %0, i32 %1) { ; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] +; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] ; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll index 5f13e97487435..7ed2006547379 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -89,8 +89,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) { define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX256-LABEL: testv32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX256-NEXT: # ymm1 = mem[0,1,0,1] +; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 3699c7f75c861..b119faaff1a35 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -49,7 +49,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -65,7 +65,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0 ; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1 -; AVX256VLBW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX256VLBW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX256VLBW-NEXT: vpmovw2m %ymm2, %k0 ; AVX256VLBW-NEXT: vpmovm2b %k0, %xmm0 @@ -80,7 +80,7 @@ define <16 x i1> 
@shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512VLBW-NEXT: vpmovm2b %k0, %xmm0 @@ -95,7 +95,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -111,7 +111,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 @@ -181,7 +181,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512NOBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512NOBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll index 155ef0faadad8..ad48edf7a512d 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll @@ -10,8 +10,8 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX256-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX256-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -43,9 +43,9 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsllw $8, %ymm0, %ymm2 ; AVX512-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm2, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) +; 
AVX512-NEXT: vpcmpneqd %zmm1, %zmm2, %k1 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper @@ -95,9 +95,9 @@ define <16 x i1> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll index 34e32c43ef797..be7900745516c 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll @@ -36,7 +36,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) { define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -60,10 +60,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) { define <16 x i16> @testv16i16(<16 x i16> %in) { ; AVX256-LABEL: testv16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX256-NEXT: # ymm3 = mem[0,1,0,1] +; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -87,10 +86,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) { define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX256-LABEL: testv32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX256-NEXT: # ymm3 = mem[0,1,0,1] +; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX256-NEXT: vpand %ymm1, %ymm0, 
%ymm0 @@ -100,10 +98,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) { ; ; AVX512VL-LABEL: testv32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -115,8 +112,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll index bf04c8d435559..ddc8f80163e64 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-shift.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-shift.ll @@ -307,7 +307,6 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256-NEXT: vpsraw $1, %ymm3, %ymm4 ; AVX256-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX256-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX256-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX256-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -318,8 +317,9 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256-NEXT: vpsraw $1, %ymm0, %ymm3 ; AVX256-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; AVX256-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX256-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX256-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX256-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX256-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: retq ; ; AVX512BW-LABEL: var_ashr_v32i8: @@ -343,7 +343,6 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512VL-NEXT: vpsraw $1, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -354,8 +353,9 @@ define <32 x i8> @var_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512VL-NEXT: vpsraw $1, %ymm0, %ymm3 ; AVX512VL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw 
$8, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -437,7 +437,6 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX256VL-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX256VL-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX256VL-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX256VL-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX256VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX256VL-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -448,8 +447,9 @@ define <16 x i8> @var_ashr_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX256VL-NEXT: vpsraw $1, %xmm0, %xmm3 ; AVX256VL-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX256VL-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX256VL-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX256VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX256VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX256VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256VL-NEXT: retq ; ; AVX512VL-LABEL: var_ashr_v16i8: diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index 885b07585e68f..d18147c487ec0 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { ; AVX256BW: # %bb.0: ; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -58,7 +58,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: -; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll index f28a2ad0fd009..1df4ee2126c0a 100644 --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -68,9 +68,9 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; SSE3-NEXT: shll $8, %eax ; SSE3-NEXT: pinsrw $2, %eax, %xmm0 ; SSE3-NEXT: psraw $8, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pextrw $1, %xmm0, %edx ; SSE3-NEXT: pextrw $2, %xmm0, %ecx +; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: # kill: def $ax killed $ax killed 
$eax ; SSE3-NEXT: # kill: def $dx killed $dx killed $edx ; SSE3-NEXT: # kill: def $cx killed $cx killed $ecx @@ -82,9 +82,9 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; SSE41-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrw $1, %xmm0, %edx ; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: # kill: def $dx killed $dx killed $edx ; SSE41-NEXT: # kill: def $cx killed $cx killed $ecx @@ -96,9 +96,9 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX-32-NEXT: vmovd %xmm0, %eax ; AVX-32-NEXT: vpextrw $1, %xmm0, %edx ; AVX-32-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-32-NEXT: vmovd %xmm0, %eax ; AVX-32-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-32-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-32-NEXT: # kill: def $cx killed $cx killed $ecx @@ -110,9 +110,9 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 ; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; AVX-64-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: vpextrw $1, %xmm0, %edx ; AVX-64-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 73ee28a7fd247..d9999a033c67a 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -70,8 +70,8 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) noun ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: movdqa %xmm0, (%rsi) ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq @@ -80,8 +80,8 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) noun ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: vmovdqa %xmm0, (%rsi) ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -90,8 +90,8 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) noun ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -100,8 +100,8 @@ define <16 x i8> @ashr_xor_and_commute_uses(<16 x i8> %x, ptr %p1, ptr %p2) noun ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm1, (%rdi) ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -793,7 +793,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x 
i32> %y) nounwind { ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -903,7 +903,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm6 ; SSE41-NEXT: packsswb %xmm7, %xmm6 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: packusdw %xmm4, %xmm3 @@ -938,7 +938,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 @@ -1047,7 +1047,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -1565,7 +1565,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_8i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -1747,13 +1747,13 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; AVX2-LABEL: psubus_8i64_max: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535] -; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1868,7 +1868,7 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm6 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pminud %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm3, 
%xmm2 @@ -1972,7 +1972,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; ; SSE41-LABEL: psubus_i16_i32_max_swapped: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2067,7 +2067,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind { ; ; SSE41-LABEL: psubus_i16_i32_min: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2498,10 +2498,10 @@ define <64 x i8> @test28(<64 x i8> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpsubusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test28: @@ -2535,11 +2535,11 @@ define <32 x i16> @test29(<32 x i16> %x) { ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test29: @@ -2656,7 +2656,7 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { ; ; SSE41-LABEL: test32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm3, %xmm2 ; SSE41-NEXT: pminud %xmm3, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -2851,8 +2851,8 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 @@ -2949,9 +2949,6 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-LABEL: test34: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm6 ; 
SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm9 ; SSE41-NEXT: pxor %xmm10, %xmm9 @@ -2968,12 +2965,15 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pxor %xmm10, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [1,1,1,1] +; SSE41-NEXT: pand %xmm12, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: movapd %xmm9, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm12, %xmm6 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm6 ; SSE41-NEXT: psubd %xmm3, %xmm6 @@ -3039,8 +3039,8 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index d3da7524eaf10..fefa97679964a 100644 --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -292,15 +292,15 @@ define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) { ; ; SSE41-LABEL: vecsel128: ; SSE41: # %bb.0: -; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: cmovel %esi, %eax ; SSE41-NEXT: retq ; ; AVX-LABEL: vecsel128: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovel %esi, %eax ; AVX-NEXT: retq %t0 = bitcast <4 x i32> %input to i128 @@ -323,16 +323,16 @@ define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) { ; ; SSE41-LABEL: vecsel256: ; SSE41: # %bb.0: -; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: cmovel %esi, %eax ; SSE41-NEXT: retq ; ; AVX-LABEL: vecsel256: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: cmovel %esi, %eax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -358,19 +358,19 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; ; SSE41-LABEL: vecsel512: ; SSE41: # %bb.0: -; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: cmovel %esi, %eax ; SSE41-NEXT: retq ; ; AVX1-LABEL: vecsel512: ; AVX1: # %bb.0: -; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: cmovel %esi, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll index 0a00a72a2dc94..900e7f9fa3244 100644 --- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll +++ 
b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll @@ -103,9 +103,9 @@ define i32 @xor_signbit_shl(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_signbit_shl: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -125,9 +125,9 @@ define i32 @xor_nosignbit_shl(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_shl: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -216,9 +216,9 @@ define i32 @and_nosignbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: and_nosignbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: andl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -239,9 +239,9 @@ define i32 @or_signbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: or_signbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: orl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -261,9 +261,9 @@ define i32 @or_nosignbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: or_nosignbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: orl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -284,9 +284,9 @@ define i32 @xor_signbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_signbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -306,9 +306,9 @@ define i32 @xor_nosignbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -329,9 +329,9 @@ define i32 @add_signbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: add_signbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: addl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -351,9 +351,9 @@ define i32 @add_nosignbit_lshr(i32 %x, ptr %dst) { ; ; X86-LABEL: add_nosignbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: addl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -397,9 +397,9 @@ define i32 @and_nosignbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: and_nosignbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: andl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -420,9 +420,9 @@ define i32 @or_signbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: or_signbit_ashr: ; X86: # 
%bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: orl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -442,9 +442,9 @@ define i32 @or_nosignbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: or_nosignbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: orl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -465,9 +465,9 @@ define i32 @xor_signbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_signbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -487,9 +487,9 @@ define i32 @xor_nosignbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: xorl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -510,9 +510,9 @@ define i32 @add_signbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: add_signbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: addl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -532,9 +532,9 @@ define i32 @add_nosignbit_ashr(i32 %x, ptr %dst) { ; ; X86-LABEL: add_nosignbit_ashr: ; X86: # %bb.0: -; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: movl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: addl 4(%esp), %eax +; X86-NEXT: movl 8(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll index 8c858e04de2a1..540bb7188c33f 100644 --- a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll @@ -76,12 +76,12 @@ define i32 @or_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_signbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -104,12 +104,12 @@ define i32 @or_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_nosignbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -133,12 +133,12 @@ define i32 @xor_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_signbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax 
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -161,12 +161,12 @@ define i32 @xor_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -190,12 +190,12 @@ define i32 @add_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: add_signbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -218,12 +218,12 @@ define i32 @add_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: add_nosignbit_select_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $16711680, %eax # imm = 0xFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -308,12 +308,12 @@ define i32 @or_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_signbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -336,12 +336,12 @@ define i32 @or_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_nosignbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -365,12 +365,12 @@ define i32 @xor_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_signbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -393,12 +393,12 @@ define i32 @xor_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -422,12 +422,12 @@ define i32 @add_signbit_select_lshr(i32 %x, i1 %cond, 
ptr %dst) { ; ; X86-LABEL: add_signbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -450,12 +450,12 @@ define i32 @add_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: add_nosignbit_select_lshr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -540,12 +540,12 @@ define i32 @or_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_signbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -568,12 +568,12 @@ define i32 @or_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: or_nosignbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -597,12 +597,12 @@ define i32 @xor_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_signbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -625,12 +625,12 @@ define i32 @xor_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: xor_nosignbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -654,12 +654,12 @@ define i32 @add_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: add_signbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -682,12 +682,12 @@ define i32 @add_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) { ; ; X86-LABEL: add_nosignbit_select_ashr: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; 
X86-NEXT: negl %eax ; X86-NEXT: andl $2147418112, %eax # imm = 0x7FFF0000 ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl @@ -713,13 +713,13 @@ define i32 @shl_signbit_select_add(i32 %x, i1 %cond, ptr %dst) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $1, %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: negb %cl ; X86-NEXT: andb $4, %cl ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl $123456, %eax # imm = 0x1E240 -; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl %t0 = shl i32 %x, 4 %t1 = select i1 %cond, i32 %t0, i32 %x @@ -774,13 +774,13 @@ define i32 @lshr_signbit_select_add(i32 %x, i1 %cond, ptr %dst, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $1, %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: negb %cl ; X86-NEXT: andb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl $123456, %eax # imm = 0x1E240 -; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl %t0 = lshr i32 %x, %y %t1 = select i1 %cond, i32 %t0, i32 %x @@ -804,13 +804,13 @@ define i32 @ashr_signbit_select_add(i32 %x, i1 %cond, ptr %dst) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $1, %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: negb %cl ; X86-NEXT: andb $4, %cl ; X86-NEXT: sarl %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl $123456, %eax # imm = 0x1E240 -; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl %t0 = ashr i32 %x, 4 %t1 = select i1 %cond, i32 %t0, i32 %x @@ -832,12 +832,12 @@ define i32 @and_signbit_select_add(i32 %x, i1 %cond, ptr %dst, i32 %y) { ; ; X86-LABEL: and_signbit_select_add: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl $123456, %eax # imm = 0x1E240 ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll index c24823538aa14..f7515e7ff5727 100644 --- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -77,7 +77,8 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) { ; CHECK-NEXT: LBB0_8: ## %while.body.preheader ; CHECK-NEXT: imulq $1040, %rdx, %rax ## imm = 0x410 ; CHECK-NEXT: movq _syBuf@GOTPCREL(%rip), %rcx -; CHECK-NEXT: leaq 8(%rcx,%rax), %rdx +; CHECK-NEXT: leaq (%rcx,%rax), %rdx +; CHECK-NEXT: addq $8, %rdx ; CHECK-NEXT: movq _syCTRO@GOTPCREL(%rip), %rax ; CHECK-NEXT: movl $1, %r13d ; CHECK-NEXT: movb $1, %cl @@ -109,8 +110,8 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) { ; CHECK-NEXT: LBB0_20: ## %while.cond197.backedge ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 ; CHECK-NEXT: decl %r13d -; CHECK-NEXT: testl %r13d, %r13d ; CHECK-NEXT: movl %ebp, %r15d +; CHECK-NEXT: testl %r13d, %r13d ; CHECK-NEXT: jle LBB0_21 ; CHECK-NEXT: LBB0_13: ## %while.body200 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 @@ -126,13 +127,13 @@ define ptr @SyFgets(ptr %line, i64 
%length, i64 %fid) { ; CHECK-NEXT: jmpq *%rax ; CHECK-NEXT: LBB0_25: ## %sw.bb474 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: ## implicit-def: $rbx +; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: jne LBB0_33 ; CHECK-NEXT: ## %bb.26: ## %do.body479.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: ## implicit-def: $rbx +; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: jne LBB0_33 ; CHECK-NEXT: ## %bb.27: ## %land.rhs485.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 @@ -262,7 +263,8 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) { ; CHECK-NEXT: callq _write ; CHECK-NEXT: subq %rbx, %r14 ; CHECK-NEXT: movq _syHistory@GOTPCREL(%rip), %rax -; CHECK-NEXT: leaq 8189(%r14,%rax), %rax +; CHECK-NEXT: addq %r14, %rax +; CHECK-NEXT: addq $8189, %rax ## imm = 0x1FFD ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_48: ## %for.body1723 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll index dab7a6aab2d82..6042ece32271c 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -696,13 +696,13 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; ; AVX-RECIP-LABEL: v8f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; FMA-RECIP-NEXT: retq ; @@ -768,7 +768,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -879,7 +879,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -892,7 +892,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; FMA-RECIP-LABEL: v8f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps 
{{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 @@ -996,14 +996,14 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { ; ; AVX-RECIP-LABEL: v16f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; FMA-RECIP-NEXT: retq @@ -1089,21 +1089,21 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm4 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm4, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_one_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1249,19 +1249,19 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; AVX-RECIP-NEXT: vsubps %ymm2, %ymm4, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; AVX-RECIP-NEXT: vaddps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-RECIP-NEXT: vsubps %ymm1, 
%ymm4, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 @@ -1271,13 +1271,13 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; FMA-RECIP-LABEL: v16f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 -; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll index 77ccaff15e42a..9f76713e6974e 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -838,7 +838,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -972,7 +972,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -987,7 +987,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] @@ -1179,13 +1179,13 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm5 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; FMA-RECIP-NEXT: vmulps 
%ymm3, %ymm2, %ymm4 -; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm5, %ymm3 +; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm2 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_one_step2: @@ -1327,7 +1327,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1345,12 +1345,12 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; FMA-RECIP-LABEL: v16f32_one_step_2_divs: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3 -; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2 +; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4 ; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0 @@ -1526,7 +1526,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1552,7 +1552,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 diff --git a/llvm/test/CodeGen/X86/reverse_branches.ll b/llvm/test/CodeGen/X86/reverse_branches.ll index 93c82a4524ef9..d85e6f886d977 100644 --- a/llvm/test/CodeGen/X86/reverse_branches.ll +++ b/llvm/test/CodeGen/X86/reverse_branches.ll @@ -65,8 +65,8 @@ define i32 @test_branches_order() uwtable ssp { ; CHECK-NEXT: movl $1000, %edx ## imm = 0x3E8 ; 
CHECK-NEXT: movl $120, %esi ; CHECK-NEXT: callq _memchr -; CHECK-NEXT: cmpq %rax, %r12 ; CHECK-NEXT: movq %r13, %rdi +; CHECK-NEXT: cmpq %rax, %r12 ; CHECK-NEXT: je LBB0_3 ; CHECK-NEXT: jmp LBB0_5 ; CHECK-NEXT: LBB0_7: ## %for.end11 @@ -89,36 +89,34 @@ define i32 @test_branches_order() uwtable ssp { ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: movq %rcx, %rdx ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_10 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_14: ## %exit ; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2 -; CHECK-NEXT: addq %rsi, %r8 -; CHECK-NEXT: incq %rdi -; CHECK-NEXT: decq %rsi ; CHECK-NEXT: addq $1001, %rdx ## imm = 0x3E9 -; CHECK-NEXT: cmpq $-1000, %r8 ## imm = 0xFC18 +; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: leaq 1(%rsi), %rsi ; CHECK-NEXT: jne LBB0_5 ; CHECK-NEXT: LBB0_10: ## %for.cond18 ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## => This Loop Header: Depth=2 ; CHECK-NEXT: ## Child Loop BB0_12 Depth 3 -; CHECK-NEXT: cmpl $999, %edi ## imm = 0x3E7 +; CHECK-NEXT: cmpl $999, %esi ## imm = 0x3E7 ; CHECK-NEXT: jg LBB0_15 ; CHECK-NEXT: ## %bb.11: ## %for.body20 ; CHECK-NEXT: ## in Loop: Header=BB0_10 Depth=2 -; CHECK-NEXT: movq $-1000, %r8 ## imm = 0xFC18 +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_12: ## %do.body.i ; CHECK-NEXT: ## Parent Loop BB0_8 Depth=1 ; CHECK-NEXT: ## Parent Loop BB0_10 Depth=2 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=3 -; CHECK-NEXT: cmpb $120, 1000(%rdx,%r8) +; CHECK-NEXT: cmpb $120, (%rdx,%rdi) ; CHECK-NEXT: je LBB0_14 ; CHECK-NEXT: ## %bb.13: ## %do.cond.i ; CHECK-NEXT: ## in Loop: Header=BB0_12 Depth=3 -; CHECK-NEXT: incq %r8 +; CHECK-NEXT: incq %rdi +; CHECK-NEXT: cmpq $1000, %rdi ## imm = 0x3E8 ; CHECK-NEXT: jne LBB0_12 ; CHECK-NEXT: LBB0_5: ## %if.then ; CHECK-NEXT: leaq L_str4(%rip), %rdi diff --git a/llvm/test/CodeGen/X86/rotate-add.ll b/llvm/test/CodeGen/X86/rotate-add.ll index c705505bbbf2a..c6906a14a334e 100644 --- a/llvm/test/CodeGen/X86/rotate-add.ll +++ b/llvm/test/CodeGen/X86/rotate-add.ll @@ -273,10 +273,10 @@ define i32 @test_rotl_mul_special_case(i32 %i) { define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { ; X86-LABEL: test_rotl_mul_with_mask_special_case: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: movl $9, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: shrdl $25, %eax, %edx ; X86-NEXT: movzbl %dl, %eax diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll index 1ead3f98ab5d6..714db6ee510d4 100644 --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -149,19 +149,12 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind { ; Result would undershift define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { -; X86-LABEL: no_extract_shl: -; X86: # %bb.0: -; X86-NEXT: vpsllq $24, %ymm0, %ymm1 -; X86-NEXT: vpsrlq $39, %ymm0, %ymm0 -; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0 -; X86-NEXT: retl -; -; X64-LABEL: no_extract_shl: -; X64: # %bb.0: -; X64-NEXT: vpsllq $24, %ymm0, %ymm1 -; X64-NEXT: vpsrlq $39, %ymm0, %ymm0 -; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 -; X64-NEXT: retq +; CHECK-LABEL: no_extract_shl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsllq $24, %ymm0, 
%ymm1 +; CHECK-NEXT: vpsrlq $39, %ymm0, %ymm0 +; CHECK-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & mem) | ymm1 +; CHECK-NEXT: ret{{[l|q]}} %lhs_mul = shl <4 x i64> %i, %rhs_mul = shl <4 x i64> %i, %lhs_shift = lshr <4 x i64> %lhs_mul, @@ -171,19 +164,12 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { ; Result would overshift define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind { -; X86-LABEL: no_extract_shrl: -; X86: # %bb.0: -; X86-NEXT: vpsrld $9, %xmm0, %xmm1 -; X86-NEXT: vpslld $25, %xmm0, %xmm0 -; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0 -; X86-NEXT: retl -; -; X64-LABEL: no_extract_shrl: -; X64: # %bb.0: -; X64-NEXT: vpsrld $9, %xmm0, %xmm1 -; X64-NEXT: vpslld $25, %xmm0, %xmm0 -; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: no_extract_shrl: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrld $9, %xmm0, %xmm1 +; CHECK-NEXT: vpslld $25, %xmm0, %xmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} xmm0 = (xmm0 & mem) | xmm1 +; CHECK-NEXT: ret{{[l|q]}} %lhs_div = lshr <4 x i32> %i, %rhs_div = lshr <4 x i32> %i, %lhs_shift = shl <4 x i32> %lhs_div, @@ -278,9 +264,10 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: vmovq %rdx, %xmm1 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rdi +; X64-NEXT: vmovq %rcx, %xmm1 ; X64-NEXT: vmovq %rdx, %xmm2 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X64-NEXT: vpsrlq $9, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..4073a72e67145 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -106,10 +106,10 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind { define i64 @rolq_extract_mul_with_mask(i64 %i) nounwind { ; X86-LABEL: rolq_extract_mul_with_mask: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: movl $9, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: shrdl $25, %eax, %edx ; X86-NEXT: movzbl %dl, %eax diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll index ea32edba62822..74e35cd5e8f7a 100644 --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -10,9 +10,9 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %edx ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: testb $32, %cl @@ -63,10 +63,10 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: shrdl %cl, %esi, %eax ; X86-NEXT: testb $32, %cl @@ -655,11 +655,11 @@ define i64 @truncated_rot(i64 %x, i32 %amt) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl $0, %edi +; X86-NEXT: testb $32, %cl ; X86-NEXT: jne .LBB28_2 ; X86-NEXT: # %bb.1: # %entry ; X86-NEXT: movl %eax, %edi diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll index 0cc9f465dd75a..a993f1a45c02b 100644 --- a/llvm/test/CodeGen/X86/rotate4.ll +++ b/llvm/test/CodeGen/X86/rotate4.ll @@ -67,9 +67,9 @@ define i64 @rotate_left_64(i64 %a, i64 %b) { ; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %edx ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: testb $32, %cl @@ -127,10 +127,10 @@ define i64 @rotate_right_64(i64 %a, i64 %b) { ; X86-NEXT: .cfi_offset %edi, -12 ; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: shrdl %cl, %esi, %eax ; X86-NEXT: testb $32, %cl @@ -245,9 +245,9 @@ define void @rotate_left_m64(ptr%pa, i64 %b) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %esi -; X86-NEXT: movl 4(%eax), %ebx ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl 4(%eax), %ebx ; X86-NEXT: movl %ebx, %edi ; X86-NEXT: shldl %cl, %esi, %edi ; X86-NEXT: testb $32, %cl @@ -314,10 +314,10 @@ define void @rotate_right_m64(ptr%pa, i64 %b) { ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %ebx ; X86-NEXT: movl 4(%eax), %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl (%eax), %ebx ; X86-NEXT: movl %ebx, %edi ; X86-NEXT: shrdl %cl, %esi, %edi ; X86-NEXT: testb $32, %cl diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index fe71a16039c19..d2d642d90c88e 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -810,11 +810,11 @@ define dso_local i32 @sad_nonloop_32i8(ptr nocapture readonly %p, i64, ptr nocap ; ; AVX1-LABEL: sad_nonloop_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 +; AVX1-NEXT: vpsadbw 16(%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rdi), %xmm1 +; AVX1-NEXT: vpsadbw (%rdx), %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -892,12 +892,12 @@ define dso_local i32 @sad_nonloop_64i8(ptr nocapture readonly %p, i64, ptr nocap ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 -; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm2 +; AVX1-NEXT: vpsadbw 48(%rdx), %xmm2, %xmm2 ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 +; AVX1-NEXT: 
vmovdqu 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw 32(%rdx), %xmm3, %xmm2 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll index 5b9a42d1f0d91..751526620944b 100644 --- a/llvm/test/CodeGen/X86/sadd_sat.ll +++ b/llvm/test/CodeGen/X86/sadd_sat.ll @@ -40,8 +40,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl ; X86-NEXT: movl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/sadd_sat_plus.ll b/llvm/test/CodeGen/X86/sadd_sat_plus.ll index deabeb27cdab8..7d0d208705f5a 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_plus.ll @@ -42,8 +42,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl ; X86-NEXT: movl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index bd563f97b0ac4..0f7a3939b3d37 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512F-LABEL: v16i4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-LABEL: v16i4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw 
$4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -638,10 +638,10 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: @@ -729,10 +729,10 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: @@ -861,12 +861,12 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm5, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: @@ -1063,7 +1063,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1174,7 +1174,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE41-LABEL: v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = 
[2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm4 ; SSE41-NEXT: paddq %xmm1, %xmm2 @@ -1352,7 +1352,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1363,14 +1363,14 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm5 ; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: movapd %xmm8, %xmm9 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: pxor %xmm2, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 @@ -1399,13 +1399,13 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vxorpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm5, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm5, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: @@ -1650,7 +1650,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm8, %xmm9 @@ -1661,14 +1661,14 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm9 ; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm9 ; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] ; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 +; SSE41-NEXT: movapd %xmm11, %xmm13 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm13 +; SSE41-NEXT: pxor %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 @@ -1680,12 +1680,12 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { 
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm4 ; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: movapd %xmm11, %xmm9 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 @@ -1713,9 +1713,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: pxor %xmm7, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 @@ -1736,7 +1736,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 @@ -1791,20 +1791,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: seto %dil +; SSE-NEXT: seto %al ; SSE-NEXT: movq %r8, %r10 ; SSE-NEXT: sarq $63, %r10 -; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: testb %al, %al ; SSE-NEXT: cmovneq %r10, %rcx ; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 ; SSE-NEXT: xorq %r11, %r10 -; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: testb %al, %al ; SSE-NEXT: cmoveq %r8, %r10 ; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 @@ -1821,20 +1821,20 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: seto %dil +; AVX-NEXT: seto %al ; AVX-NEXT: movq %r8, %r10 ; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: testb %al, %al ; AVX-NEXT: cmovneq %r10, %rcx ; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 ; AVX-NEXT: xorq %r11, %r10 -; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: testb %al, %al ; AVX-NEXT: cmoveq %r8, %r10 ; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index b12be7cb129d3..5f84198682c74 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -693,7 +693,7 @@ define <2 x i64> 
@unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm2, %xmm0 @@ -722,7 +722,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -757,7 +757,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [42,42] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] @@ -781,7 +781,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -1191,8 +1191,8 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_min: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 @@ -1261,7 +1261,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 @@ -1329,11 +1329,11 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2 ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: diff --git 
a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll index 50a967e1c2a1a..e0cdf09aa16d5 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -43,9 +43,9 @@ define i32 @f_to_u32(float %a) nounwind { ; X86-SSE-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-WIN-NEXT: cvttss2si %xmm0, %ecx ; X86-SSE-WIN-NEXT: movl %ecx, %edx -; X86-SSE-WIN-NEXT: sarl $31, %edx ; X86-SSE-WIN-NEXT: subss __real@4f000000, %xmm0 ; X86-SSE-WIN-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-WIN-NEXT: sarl $31, %edx ; X86-SSE-WIN-NEXT: andl %edx, %eax ; X86-SSE-WIN-NEXT: orl %ecx, %eax ; X86-SSE-WIN-NEXT: retl @@ -55,9 +55,9 @@ define i32 @f_to_u32(float %a) nounwind { ; X86-SSE-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-LIN-NEXT: cvttss2si %xmm0, %ecx ; X86-SSE-LIN-NEXT: movl %ecx, %edx -; X86-SSE-LIN-NEXT: sarl $31, %edx ; X86-SSE-LIN-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-LIN-NEXT: cvttss2si %xmm0, %eax +; X86-SSE-LIN-NEXT: sarl $31, %edx ; X86-SSE-LIN-NEXT: andl %edx, %eax ; X86-SSE-LIN-NEXT: orl %ecx, %eax ; X86-SSE-LIN-NEXT: retl @@ -160,9 +160,9 @@ define i32 @d_to_u32(double %a) nounwind { ; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE3-WIN-NEXT: cvttsd2si %xmm0, %ecx ; X86-SSE3-WIN-NEXT: movl %ecx, %edx -; X86-SSE3-WIN-NEXT: sarl $31, %edx ; X86-SSE3-WIN-NEXT: subsd __real@41e0000000000000, %xmm0 ; X86-SSE3-WIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE3-WIN-NEXT: sarl $31, %edx ; X86-SSE3-WIN-NEXT: andl %edx, %eax ; X86-SSE3-WIN-NEXT: orl %ecx, %eax ; X86-SSE3-WIN-NEXT: retl @@ -172,9 +172,9 @@ define i32 @d_to_u32(double %a) nounwind { ; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE3-LIN-NEXT: cvttsd2si %xmm0, %ecx ; X86-SSE3-LIN-NEXT: movl %ecx, %edx -; X86-SSE3-LIN-NEXT: sarl $31, %edx ; X86-SSE3-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE3-LIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE3-LIN-NEXT: sarl $31, %edx ; X86-SSE3-LIN-NEXT: andl %edx, %eax ; X86-SSE3-LIN-NEXT: orl %ecx, %eax ; X86-SSE3-LIN-NEXT: retl @@ -190,9 +190,9 @@ define i32 @d_to_u32(double %a) nounwind { ; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-WIN-NEXT: cvttsd2si %xmm0, %ecx ; X86-SSE2-WIN-NEXT: movl %ecx, %edx -; X86-SSE2-WIN-NEXT: sarl $31, %edx ; X86-SSE2-WIN-NEXT: subsd __real@41e0000000000000, %xmm0 ; X86-SSE2-WIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE2-WIN-NEXT: sarl $31, %edx ; X86-SSE2-WIN-NEXT: andl %edx, %eax ; X86-SSE2-WIN-NEXT: orl %ecx, %eax ; X86-SSE2-WIN-NEXT: retl @@ -202,9 +202,9 @@ define i32 @d_to_u32(double %a) nounwind { ; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE2-LIN-NEXT: cvttsd2si %xmm0, %ecx ; X86-SSE2-LIN-NEXT: movl %ecx, %edx -; X86-SSE2-LIN-NEXT: sarl $31, %edx ; X86-SSE2-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-LIN-NEXT: cvttsd2si %xmm0, %eax +; X86-SSE2-LIN-NEXT: sarl $31, %edx ; X86-SSE2-LIN-NEXT: andl %edx, %eax ; X86-SSE2-LIN-NEXT: orl %ecx, %eax ; X86-SSE2-LIN-NEXT: retl @@ -795,26 +795,23 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; -; X86-SSE-WIN-LABEL: t_to_u32: -; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: calll ___fixunstfsi -; X86-SSE-WIN-NEXT: addl $16, %esp -; X86-SSE-WIN-NEXT: retl +; X86-SSE3-WIN-LABEL: t_to_u32: +; 
X86-SSE3-WIN: # %bb.0: +; X86-SSE3-WIN-NEXT: subl $16, %esp +; X86-SSE3-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE3-WIN-NEXT: movups %xmm0, (%esp) +; X86-SSE3-WIN-NEXT: calll ___fixunstfsi +; X86-SSE3-WIN-NEXT: addl $16, %esp +; X86-SSE3-WIN-NEXT: retl ; -; X86-SSE-LIN-LABEL: t_to_u32: -; X86-SSE-LIN: # %bb.0: -; X86-SSE-LIN-NEXT: subl $12, %esp -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: calll __fixunstfsi -; X86-SSE-LIN-NEXT: addl $28, %esp -; X86-SSE-LIN-NEXT: retl +; X86-SSE3-LIN-LABEL: t_to_u32: +; X86-SSE3-LIN: # %bb.0: +; X86-SSE3-LIN-NEXT: subl $28, %esp +; X86-SSE3-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE3-LIN-NEXT: movups %xmm0, (%esp) +; X86-SSE3-LIN-NEXT: calll __fixunstfsi +; X86-SSE3-LIN-NEXT: addl $28, %esp +; X86-SSE3-LIN-NEXT: retl ; ; X64-SSE-WIN-LABEL: t_to_u32: ; X64-SSE-WIN: # %bb.0: @@ -833,6 +830,45 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; +; X86-SSE2-WIN-LABEL: t_to_u32: +; X86-SSE2-WIN: # %bb.0: +; X86-SSE2-WIN-NEXT: subl $16, %esp +; X86-SSE2-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE2-WIN-NEXT: movups %xmm0, (%esp) +; X86-SSE2-WIN-NEXT: calll ___fixunstfsi +; X86-SSE2-WIN-NEXT: addl $16, %esp +; X86-SSE2-WIN-NEXT: retl +; +; X86-SSE2-LIN-LABEL: t_to_u32: +; X86-SSE2-LIN: # %bb.0: +; X86-SSE2-LIN-NEXT: subl $28, %esp +; X86-SSE2-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE2-LIN-NEXT: movups %xmm0, (%esp) +; X86-SSE2-LIN-NEXT: calll __fixunstfsi +; X86-SSE2-LIN-NEXT: addl $28, %esp +; X86-SSE2-LIN-NEXT: retl +; +; X86-SSE1-WIN-LABEL: t_to_u32: +; X86-SSE1-WIN: # %bb.0: +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: calll ___fixunstfsi +; X86-SSE1-WIN-NEXT: addl $16, %esp +; X86-SSE1-WIN-NEXT: retl +; +; X86-SSE1-LIN-LABEL: t_to_u32: +; X86-SSE1-LIN: # %bb.0: +; X86-SSE1-LIN-NEXT: subl $12, %esp +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: calll __fixunstfsi +; X86-SSE1-LIN-NEXT: addl $28, %esp +; X86-SSE1-LIN-NEXT: retl +; ; X87-WIN-LABEL: t_to_u32: ; X87-WIN: # %bb.0: ; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) @@ -893,26 +929,23 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; -; X86-SSE-WIN-LABEL: t_to_s32: -; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: calll ___fixtfsi -; X86-SSE-WIN-NEXT: addl $16, %esp -; X86-SSE-WIN-NEXT: retl +; X86-SSE3-WIN-LABEL: t_to_s32: +; X86-SSE3-WIN: # %bb.0: +; X86-SSE3-WIN-NEXT: subl $16, %esp +; X86-SSE3-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE3-WIN-NEXT: movups %xmm0, (%esp) +; X86-SSE3-WIN-NEXT: calll ___fixtfsi +; X86-SSE3-WIN-NEXT: addl $16, %esp +; X86-SSE3-WIN-NEXT: retl ; -; X86-SSE-LIN-LABEL: t_to_s32: -; X86-SSE-LIN: # %bb.0: -; X86-SSE-LIN-NEXT: subl $12, %esp -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) 
-; X86-SSE-LIN-NEXT: calll __fixtfsi -; X86-SSE-LIN-NEXT: addl $28, %esp -; X86-SSE-LIN-NEXT: retl +; X86-SSE3-LIN-LABEL: t_to_s32: +; X86-SSE3-LIN: # %bb.0: +; X86-SSE3-LIN-NEXT: subl $28, %esp +; X86-SSE3-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE3-LIN-NEXT: movups %xmm0, (%esp) +; X86-SSE3-LIN-NEXT: calll __fixtfsi +; X86-SSE3-LIN-NEXT: addl $28, %esp +; X86-SSE3-LIN-NEXT: retl ; ; X64-SSE-WIN-LABEL: t_to_s32: ; X64-SSE-WIN: # %bb.0: @@ -931,6 +964,45 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; +; X86-SSE2-WIN-LABEL: t_to_s32: +; X86-SSE2-WIN: # %bb.0: +; X86-SSE2-WIN-NEXT: subl $16, %esp +; X86-SSE2-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE2-WIN-NEXT: movups %xmm0, (%esp) +; X86-SSE2-WIN-NEXT: calll ___fixtfsi +; X86-SSE2-WIN-NEXT: addl $16, %esp +; X86-SSE2-WIN-NEXT: retl +; +; X86-SSE2-LIN-LABEL: t_to_s32: +; X86-SSE2-LIN: # %bb.0: +; X86-SSE2-LIN-NEXT: subl $28, %esp +; X86-SSE2-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE2-LIN-NEXT: movups %xmm0, (%esp) +; X86-SSE2-LIN-NEXT: calll __fixtfsi +; X86-SSE2-LIN-NEXT: addl $28, %esp +; X86-SSE2-LIN-NEXT: retl +; +; X86-SSE1-WIN-LABEL: t_to_s32: +; X86-SSE1-WIN: # %bb.0: +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-WIN-NEXT: calll ___fixtfsi +; X86-SSE1-WIN-NEXT: addl $16, %esp +; X86-SSE1-WIN-NEXT: retl +; +; X86-SSE1-LIN-LABEL: t_to_s32: +; X86-SSE1-LIN: # %bb.0: +; X86-SSE1-LIN-NEXT: subl $12, %esp +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-LIN-NEXT: calll __fixtfsi +; X86-SSE1-LIN-NEXT: addl $28, %esp +; X86-SSE1-LIN-NEXT: retl +; ; X87-WIN-LABEL: t_to_s32: ; X87-WIN: # %bb.0: ; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll index f516db8b30ffe..ae5925caf6f69 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -157,9 +157,9 @@ define i64 @f_to_u64(float %a) nounwind { ; X64-SSE-WIN: # %bb.0: ; X64-SSE-WIN-NEXT: cvttss2si %xmm0, %rcx ; X64-SSE-WIN-NEXT: movq %rcx, %rdx -; X64-SSE-WIN-NEXT: sarq $63, %rdx ; X64-SSE-WIN-NEXT: subss __real@5f000000(%rip), %xmm0 ; X64-SSE-WIN-NEXT: cvttss2si %xmm0, %rax +; X64-SSE-WIN-NEXT: sarq $63, %rdx ; X64-SSE-WIN-NEXT: andq %rdx, %rax ; X64-SSE-WIN-NEXT: orq %rcx, %rax ; X64-SSE-WIN-NEXT: retq @@ -168,9 +168,9 @@ define i64 @f_to_u64(float %a) nounwind { ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: cvttss2si %xmm0, %rcx ; X64-SSE-LIN-NEXT: movq %rcx, %rdx -; X64-SSE-LIN-NEXT: sarq $63, %rdx ; X64-SSE-LIN-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-LIN-NEXT: cvttss2si %xmm0, %rax +; X64-SSE-LIN-NEXT: sarq $63, %rdx ; X64-SSE-LIN-NEXT: andq %rdx, %rax ; X64-SSE-LIN-NEXT: orq %rcx, %rax ; X64-SSE-LIN-NEXT: retq @@ -590,9 +590,9 @@ define i64 @d_to_u64(double %a) nounwind { ; X64-SSE-WIN: # %bb.0: ; X64-SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx ; X64-SSE-WIN-NEXT: movq %rcx, %rdx -; X64-SSE-WIN-NEXT: sarq $63, %rdx ; X64-SSE-WIN-NEXT: subsd __real@43e0000000000000(%rip), %xmm0 ; X64-SSE-WIN-NEXT: cvttsd2si %xmm0, %rax +; X64-SSE-WIN-NEXT: sarq $63, %rdx ; X64-SSE-WIN-NEXT: andq %rdx, %rax ; X64-SSE-WIN-NEXT: orq %rcx, %rax ; X64-SSE-WIN-NEXT: retq @@ -601,9 +601,9 @@ define i64 
@d_to_u64(double %a) nounwind { ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: cvttsd2si %xmm0, %rcx ; X64-SSE-LIN-NEXT: movq %rcx, %rdx -; X64-SSE-LIN-NEXT: sarq $63, %rdx ; X64-SSE-LIN-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-LIN-NEXT: cvttsd2si %xmm0, %rax +; X64-SSE-LIN-NEXT: sarq $63, %rdx ; X64-SSE-LIN-NEXT: andq %rdx, %rax ; X64-SSE-LIN-NEXT: orq %rcx, %rax ; X64-SSE-LIN-NEXT: retq @@ -919,8 +919,8 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind { ; X86-AVX512-WIN-NEXT: fcmovbe %st(1), %st ; X86-AVX512-WIN-NEXT: fstp %st(1) ; X86-AVX512-WIN-NEXT: fsubrp %st, %st(1) -; X86-AVX512-WIN-NEXT: fisttpll (%esp) ; X86-AVX512-WIN-NEXT: setbe %dl +; X86-AVX512-WIN-NEXT: fisttpll (%esp) ; X86-AVX512-WIN-NEXT: shll $31, %edx ; X86-AVX512-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512-WIN-NEXT: movl (%esp), %eax @@ -939,8 +939,8 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind { ; X86-AVX512-LIN-NEXT: fcmovbe %st(1), %st ; X86-AVX512-LIN-NEXT: fstp %st(1) ; X86-AVX512-LIN-NEXT: fsubrp %st, %st(1) -; X86-AVX512-LIN-NEXT: fisttpll (%esp) ; X86-AVX512-LIN-NEXT: setbe %dl +; X86-AVX512-LIN-NEXT: fisttpll (%esp) ; X86-AVX512-LIN-NEXT: shll $31, %edx ; X86-AVX512-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-AVX512-LIN-NEXT: movl (%esp), %eax @@ -995,8 +995,8 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind { ; X86-SSE3-WIN-NEXT: fcmovbe %st(1), %st ; X86-SSE3-WIN-NEXT: fstp %st(1) ; X86-SSE3-WIN-NEXT: fsubrp %st, %st(1) -; X86-SSE3-WIN-NEXT: fisttpll (%esp) ; X86-SSE3-WIN-NEXT: setbe %dl +; X86-SSE3-WIN-NEXT: fisttpll (%esp) ; X86-SSE3-WIN-NEXT: shll $31, %edx ; X86-SSE3-WIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-WIN-NEXT: movl (%esp), %eax @@ -1015,8 +1015,8 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind { ; X86-SSE3-LIN-NEXT: fcmovbe %st(1), %st ; X86-SSE3-LIN-NEXT: fstp %st(1) ; X86-SSE3-LIN-NEXT: fsubrp %st, %st(1) -; X86-SSE3-LIN-NEXT: fisttpll (%esp) ; X86-SSE3-LIN-NEXT: setbe %dl +; X86-SSE3-LIN-NEXT: fisttpll (%esp) ; X86-SSE3-LIN-NEXT: shll $31, %edx ; X86-SSE3-LIN-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X86-SSE3-LIN-NEXT: movl (%esp), %eax @@ -1452,21 +1452,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-WIN-NEXT: movups %xmm0, (%esp) ; X86-SSE-WIN-NEXT: calll ___fixunstfdi ; X86-SSE-WIN-NEXT: addl $16, %esp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u64: ; X86-SSE-LIN: # %bb.0: -; X86-SSE-LIN-NEXT: subl $12, %esp -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-LIN-NEXT: subl $28, %esp +; X86-SSE-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-LIN-NEXT: movups %xmm0, (%esp) ; X86-SSE-LIN-NEXT: calll __fixunstfdi ; X86-SSE-LIN-NEXT: addl $28, %esp ; X86-SSE-LIN-NEXT: retl @@ -1550,21 +1547,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-WIN-NEXT: movups %xmm0, (%esp) ; 
X86-SSE-WIN-NEXT: calll ___fixtfdi ; X86-SSE-WIN-NEXT: addl $16, %esp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s64: ; X86-SSE-LIN: # %bb.0: -; X86-SSE-LIN-NEXT: subl $12, %esp -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-LIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-LIN-NEXT: subl $28, %esp +; X86-SSE-LIN-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-LIN-NEXT: movups %xmm0, (%esp) ; X86-SSE-LIN-NEXT: calll __fixtfdi ; X86-SSE-LIN-NEXT: addl $28, %esp ; X86-SSE-LIN-NEXT: retl diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll index 1d98b4f62069d..274799661e837 100644 --- a/llvm/test/CodeGen/X86/scalar_widen_div.ll +++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll @@ -106,27 +106,27 @@ define <5 x i16> @test_short_div(<5 x i16> %num, <5 x i16> %div) { ; CHECK-NEXT: pextrw $3, %xmm1, %esi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd +; CHECK-NEXT: pextrw $2, %xmm0, %edi ; CHECK-NEXT: idivw %si ; CHECK-NEXT: movl %eax, %esi -; CHECK-NEXT: pextrw $2, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm1, %edi -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: pextrw $2, %xmm1, %r8d +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %di +; CHECK-NEXT: idivw %r8w ; CHECK-NEXT: movl %eax, %edi ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movd %xmm1, %r8d -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %r8w -; CHECK-NEXT: movl %eax, %r8d -; CHECK-NEXT: pextrw $1, %xmm0, %eax -; CHECK-NEXT: pextrw $1, %xmm1, %r9d +; CHECK-NEXT: movd %xmm1, %r9d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd +; CHECK-NEXT: pextrw $1, %xmm0, %r8d ; CHECK-NEXT: idivw %r9w +; CHECK-NEXT: movl %eax, %r9d +; CHECK-NEXT: pextrw $1, %xmm1, %r10d +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: cwtd +; CHECK-NEXT: idivw %r10w ; CHECK-NEXT: # kill: def $ax killed $ax def $eax -; CHECK-NEXT: movd %r8d, %xmm0 +; CHECK-NEXT: movd %r9d, %xmm0 ; CHECK-NEXT: pinsrw $1, %eax, %xmm0 ; CHECK-NEXT: pinsrw $2, %edi, %xmm0 ; CHECK-NEXT: pinsrw $3, %esi, %xmm0 @@ -292,27 +292,27 @@ define <5 x i16> @test_short_rem(<5 x i16> %num, <5 x i16> %rem) { ; CHECK-NEXT: pextrw $3, %xmm1, %esi ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd +; CHECK-NEXT: pextrw $2, %xmm0, %edi ; CHECK-NEXT: idivw %si ; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: pextrw $2, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm1, %edi -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: pextrw $2, %xmm1, %r8d +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %di +; CHECK-NEXT: idivw %r8w ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: movd %xmm1, %r8d -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-NEXT: cwtd -; CHECK-NEXT: idivw %r8w -; CHECK-NEXT: movl %edx, %r8d -; CHECK-NEXT: pextrw $1, %xmm0, %eax -; CHECK-NEXT: pextrw $1, %xmm1, %r9d +; CHECK-NEXT: movd %xmm1, %r9d ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: cwtd +; CHECK-NEXT: pextrw $1, %xmm0, %r8d ; CHECK-NEXT: idivw %r9w +; CHECK-NEXT: movl %edx, %r9d +; CHECK-NEXT: pextrw $1, %xmm1, %r10d +; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: cwtd +; CHECK-NEXT: idivw %r10w ; CHECK-NEXT: # kill: def $dx killed $dx def $edx -; CHECK-NEXT: movd %r8d, %xmm0 +; CHECK-NEXT: movd %r9d, %xmm0 ; CHECK-NEXT: pinsrw $1, %edx, %xmm0 ; 
CHECK-NEXT: pinsrw $2, %edi, %xmm0 ; CHECK-NEXT: pinsrw $3, %esi, %xmm0 diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll index 426587a84ce17..e853593ec2801 100644 --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -12,150 +12,150 @@ define i256 @test1(i256 %a) nounwind { ; ILP-LABEL: test1: ; ILP: # %bb.0: -; ILP-NEXT: movq %rdi, %rax ; ILP-NEXT: xorps %xmm0, %xmm0 -; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; ILP-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: leal (%rsi,%rsi), %ecx ; ILP-NEXT: addb $3, %cl ; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; ILP-NEXT: movl %ecx, %edx -; ILP-NEXT: shrb $3, %dl -; ILP-NEXT: andb $24, %dl -; ILP-NEXT: negb %dl -; ILP-NEXT: movsbq %dl, %rdx -; ILP-NEXT: movq -24(%rsp,%rdx), %rsi -; ILP-NEXT: movq -16(%rsp,%rdx), %rdi -; ILP-NEXT: shldq %cl, %rsi, %rdi -; ILP-NEXT: movq -40(%rsp,%rdx), %r8 -; ILP-NEXT: movq -32(%rsp,%rdx), %rdx -; ILP-NEXT: movq %r8, %r9 -; ILP-NEXT: shlq %cl, %r9 -; ILP-NEXT: movq %rdx, %r10 -; ILP-NEXT: shldq %cl, %r8, %r10 -; ILP-NEXT: movq %rdi, 24(%rax) -; ILP-NEXT: movq %r10, 8(%rax) -; ILP-NEXT: movq %r9, (%rax) -; ILP-NEXT: shlq %cl, %rsi +; ILP-NEXT: movl %ecx, %eax +; ILP-NEXT: shrb $3, %al +; ILP-NEXT: andb $24, %al +; ILP-NEXT: negb %al +; ILP-NEXT: movsbq %al, %rsi +; ILP-NEXT: movq -24(%rsp,%rsi), %rdx +; ILP-NEXT: movq -16(%rsp,%rsi), %r8 +; ILP-NEXT: shldq %cl, %rdx, %r8 +; ILP-NEXT: movq -40(%rsp,%rsi), %r9 +; ILP-NEXT: movq %r9, %r10 +; ILP-NEXT: shlq %cl, %r10 +; ILP-NEXT: movq %rdi, %rax +; ILP-NEXT: movq -32(%rsp,%rsi), %rsi +; ILP-NEXT: movq %rsi, %rdi +; ILP-NEXT: shldq %cl, %r9, %rdi +; ILP-NEXT: shlq %cl, %rdx +; ILP-NEXT: movq %r8, 24(%rax) +; ILP-NEXT: movq %rdi, 8(%rax) ; ILP-NEXT: notb %cl -; ILP-NEXT: shrq %rdx +; ILP-NEXT: shrq %rsi ; ILP-NEXT: # kill: def $cl killed $cl killed $ecx -; ILP-NEXT: shrq %cl, %rdx -; ILP-NEXT: orq %rsi, %rdx -; ILP-NEXT: movq %rdx, 16(%rax) +; ILP-NEXT: shrq %cl, %rsi +; ILP-NEXT: movq %r10, (%rax) +; ILP-NEXT: orq %rdx, %rsi +; ILP-NEXT: movq %rsi, 16(%rax) ; ILP-NEXT: retq ; ; HYBRID-LABEL: test1: ; HYBRID: # %bb.0: -; HYBRID-NEXT: movq %rdi, %rax ; HYBRID-NEXT: xorps %xmm0, %xmm0 -; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; HYBRID-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: leal (%rsi,%rsi), %ecx ; HYBRID-NEXT: addb $3, %cl -; HYBRID-NEXT: movl %ecx, %edx -; HYBRID-NEXT: shrb $3, %dl -; HYBRID-NEXT: andb $24, %dl -; HYBRID-NEXT: negb %dl -; HYBRID-NEXT: movsbq %dl, %rdx -; HYBRID-NEXT: movq -24(%rsp,%rdx), %rsi -; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdi -; HYBRID-NEXT: shldq %cl, %rsi, %rdi -; HYBRID-NEXT: movq %rdi, 24(%rax) -; HYBRID-NEXT: movq -40(%rsp,%rdx), %rdi -; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdx -; HYBRID-NEXT: movq %rdx, %r8 +; HYBRID-NEXT: movl %ecx, %eax +; HYBRID-NEXT: shrb $3, %al +; HYBRID-NEXT: andb $24, %al +; HYBRID-NEXT: negb %al +; HYBRID-NEXT: movsbq %al, %rsi +; HYBRID-NEXT: movq -24(%rsp,%rsi), %rdx +; HYBRID-NEXT: movq -16(%rsp,%rsi), %r8 +; HYBRID-NEXT: shldq %cl, %rdx, %r8 +; HYBRID-NEXT: movq %rdi, %rax +; HYBRID-NEXT: movq %r8, 24(%rdi) +; HYBRID-NEXT: movq -40(%rsp,%rsi), %rdi +; HYBRID-NEXT: movq 
-32(%rsp,%rsi), %rsi +; HYBRID-NEXT: movq %rsi, %r8 ; HYBRID-NEXT: shldq %cl, %rdi, %r8 -; HYBRID-NEXT: movq %r8, 8(%rax) ; HYBRID-NEXT: shlq %cl, %rdi -; HYBRID-NEXT: movq %rdi, (%rax) -; HYBRID-NEXT: shlq %cl, %rsi +; HYBRID-NEXT: shlq %cl, %rdx +; HYBRID-NEXT: movq %r8, 8(%rax) ; HYBRID-NEXT: notb %cl -; HYBRID-NEXT: shrq %rdx +; HYBRID-NEXT: shrq %rsi ; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx -; HYBRID-NEXT: shrq %cl, %rdx -; HYBRID-NEXT: orq %rsi, %rdx -; HYBRID-NEXT: movq %rdx, 16(%rax) +; HYBRID-NEXT: shrq %cl, %rsi +; HYBRID-NEXT: movq %rdi, (%rax) +; HYBRID-NEXT: orq %rdx, %rsi +; HYBRID-NEXT: movq %rsi, 16(%rax) ; HYBRID-NEXT: retq ; ; BURR-LABEL: test1: ; BURR: # %bb.0: -; BURR-NEXT: movq %rdi, %rax ; BURR-NEXT: xorps %xmm0, %xmm0 -; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; BURR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; BURR-NEXT: leal (%rsi,%rsi), %ecx ; BURR-NEXT: addb $3, %cl -; BURR-NEXT: movl %ecx, %edx -; BURR-NEXT: shrb $3, %dl -; BURR-NEXT: andb $24, %dl -; BURR-NEXT: negb %dl -; BURR-NEXT: movsbq %dl, %rdx -; BURR-NEXT: movq -24(%rsp,%rdx), %rsi -; BURR-NEXT: movq -16(%rsp,%rdx), %rdi -; BURR-NEXT: shldq %cl, %rsi, %rdi -; BURR-NEXT: movq %rdi, 24(%rax) -; BURR-NEXT: movq -40(%rsp,%rdx), %rdi -; BURR-NEXT: movq -32(%rsp,%rdx), %rdx -; BURR-NEXT: movq %rdx, %r8 +; BURR-NEXT: movl %ecx, %eax +; BURR-NEXT: shrb $3, %al +; BURR-NEXT: andb $24, %al +; BURR-NEXT: negb %al +; BURR-NEXT: movsbq %al, %rsi +; BURR-NEXT: movq -24(%rsp,%rsi), %rdx +; BURR-NEXT: movq -16(%rsp,%rsi), %r8 +; BURR-NEXT: shldq %cl, %rdx, %r8 +; BURR-NEXT: movq %rdi, %rax +; BURR-NEXT: movq %r8, 24(%rdi) +; BURR-NEXT: movq -40(%rsp,%rsi), %rdi +; BURR-NEXT: movq -32(%rsp,%rsi), %rsi +; BURR-NEXT: movq %rsi, %r8 ; BURR-NEXT: shldq %cl, %rdi, %r8 -; BURR-NEXT: movq %r8, 8(%rax) ; BURR-NEXT: shlq %cl, %rdi -; BURR-NEXT: movq %rdi, (%rax) -; BURR-NEXT: shlq %cl, %rsi +; BURR-NEXT: shlq %cl, %rdx +; BURR-NEXT: movq %r8, 8(%rax) ; BURR-NEXT: notb %cl -; BURR-NEXT: shrq %rdx +; BURR-NEXT: shrq %rsi ; BURR-NEXT: # kill: def $cl killed $cl killed $ecx -; BURR-NEXT: shrq %cl, %rdx -; BURR-NEXT: orq %rsi, %rdx -; BURR-NEXT: movq %rdx, 16(%rax) +; BURR-NEXT: shrq %cl, %rsi +; BURR-NEXT: movq %rdi, (%rax) +; BURR-NEXT: orq %rdx, %rsi +; BURR-NEXT: movq %rsi, 16(%rax) ; BURR-NEXT: retq ; ; SRC-LABEL: test1: ; SRC: # %bb.0: -; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: leal (%rsi,%rsi), %edx -; SRC-NEXT: addb $3, %dl +; SRC-NEXT: leal (%rsi,%rsi), %eax +; SRC-NEXT: addb $3, %al ; SRC-NEXT: xorps %xmm0, %xmm0 -; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SRC-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movl %edx, %ecx +; SRC-NEXT: movl %eax, %ecx ; SRC-NEXT: shrb $3, %cl ; SRC-NEXT: andb $24, %cl ; SRC-NEXT: negb %cl -; SRC-NEXT: movsbq %cl, %rsi -; SRC-NEXT: movq -24(%rsp,%rsi), %rdi -; SRC-NEXT: movq %rdi, %r8 -; SRC-NEXT: movl %edx, %ecx +; SRC-NEXT: movsbq %cl, %rdx +; SRC-NEXT: movq -24(%rsp,%rdx), %rsi +; SRC-NEXT: movq %rsi, %r8 +; SRC-NEXT: movl %eax, %ecx ; SRC-NEXT: shlq %cl, %r8 ; SRC-NEXT: notb %cl -; SRC-NEXT: movq -40(%rsp,%rsi), %r9 -; SRC-NEXT: movq -32(%rsp,%rsi), %r10 -; SRC-NEXT: movq %r10, %r11 -; SRC-NEXT: shrq %r11 -; SRC-NEXT: shrq %cl, %r11 -; 
SRC-NEXT: orq %r8, %r11 -; SRC-NEXT: movq -16(%rsp,%rsi), %rsi -; SRC-NEXT: movl %edx, %ecx -; SRC-NEXT: shldq %cl, %rdi, %rsi -; SRC-NEXT: movq %r9, %rdi -; SRC-NEXT: shlq %cl, %rdi -; SRC-NEXT: shldq %cl, %r9, %r10 -; SRC-NEXT: movq %rsi, 24(%rax) -; SRC-NEXT: movq %r10, 8(%rax) -; SRC-NEXT: movq %rdi, (%rax) -; SRC-NEXT: movq %r11, 16(%rax) +; SRC-NEXT: movq -32(%rsp,%rdx), %r9 +; SRC-NEXT: movq %r9, %r10 +; SRC-NEXT: shrq %r10 +; SRC-NEXT: shrq %cl, %r10 +; SRC-NEXT: movq -16(%rsp,%rdx), %r11 +; SRC-NEXT: movl %eax, %ecx +; SRC-NEXT: shldq %cl, %rsi, %r11 +; SRC-NEXT: movq -40(%rsp,%rdx), %rdx +; SRC-NEXT: movq %rdx, %rsi +; SRC-NEXT: shlq %cl, %rsi +; SRC-NEXT: shldq %cl, %rdx, %r9 +; SRC-NEXT: movq %rdi, %rax +; SRC-NEXT: orq %r8, %r10 +; SRC-NEXT: movq %r11, 24(%rdi) +; SRC-NEXT: movq %r9, 8(%rdi) +; SRC-NEXT: movq %rsi, (%rdi) +; SRC-NEXT: movq %r10, 16(%rdi) ; SRC-NEXT: retq ; ; LIN-LABEL: test1: @@ -172,8 +172,8 @@ define i256 @test1(i256 %a) nounwind { ; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; LIN-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movq -40(%rsp,%rsi), %rdi ; LIN-NEXT: movq %rdi, %r8 ; LIN-NEXT: movl %edx, %ecx @@ -209,27 +209,26 @@ define i256 @test1(i256 %a) nounwind { define i256 @test2(i256 %a) nounwind { ; ILP-LABEL: test2: ; ILP: # %bb.0: -; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorps %xmm0, %xmm0 -; ILP-NEXT: movaps %xmm0, 16(%rdi) -; ILP-NEXT: xorl %edi, %edi +; ILP-NEXT: xorl %r9d, %r9d ; ILP-NEXT: movq %rsi, %r11 ; ILP-NEXT: negq %r11 ; ILP-NEXT: movl $0, %r10d ; ILP-NEXT: sbbq %rdx, %r10 -; ILP-NEXT: movl $0, %r9d -; ILP-NEXT: sbbq %rcx, %r9 -; ILP-NEXT: sbbq %r8, %rdi -; ILP-NEXT: andq %r8, %rdi -; ILP-NEXT: bsrq %rdi, %r8 +; ILP-NEXT: movq %rdi, %rax +; ILP-NEXT: movl $0, %edi +; ILP-NEXT: sbbq %rcx, %rdi +; ILP-NEXT: xorps %xmm0, %xmm0 +; ILP-NEXT: sbbq %r8, %r9 +; ILP-NEXT: andq %r8, %r9 +; ILP-NEXT: bsrq %r9, %r8 ; ILP-NEXT: andq %rdx, %r10 ; ILP-NEXT: bsrq %r10, %rdx ; ILP-NEXT: xorq $63, %r8 -; ILP-NEXT: andq %rcx, %r9 -; ILP-NEXT: bsrq %r9, %rcx +; ILP-NEXT: andq %rcx, %rdi +; ILP-NEXT: bsrq %rdi, %rcx ; ILP-NEXT: xorq $63, %rcx ; ILP-NEXT: orq $64, %rcx -; ILP-NEXT: testq %rdi, %rdi +; ILP-NEXT: testq %r9, %r9 ; ILP-NEXT: cmovneq %r8, %rcx ; ILP-NEXT: xorq $63, %rdx ; ILP-NEXT: andq %rsi, %r11 @@ -240,34 +239,34 @@ define i256 @test2(i256 %a) nounwind { ; ILP-NEXT: testq %r10, %r10 ; ILP-NEXT: cmovneq %rdx, %rsi ; ILP-NEXT: subq $-128, %rsi -; ILP-NEXT: orq %rdi, %r9 +; ILP-NEXT: orq %r9, %rdi ; ILP-NEXT: cmovneq %rcx, %rsi +; ILP-NEXT: movups %xmm0, 8(%rax) ; ILP-NEXT: movq %rsi, (%rax) -; ILP-NEXT: movq $0, 8(%rax) +; ILP-NEXT: movq $0, 24(%rax) ; ILP-NEXT: retq ; ; HYBRID-LABEL: test2: ; HYBRID: # %bb.0: -; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorps %xmm0, %xmm0 -; HYBRID-NEXT: movaps %xmm0, 16(%rdi) -; HYBRID-NEXT: xorl %edi, %edi +; HYBRID-NEXT: xorl %r9d, %r9d ; HYBRID-NEXT: movq %rsi, %r11 ; HYBRID-NEXT: negq %r11 ; HYBRID-NEXT: movl $0, %r10d ; HYBRID-NEXT: sbbq %rdx, %r10 -; HYBRID-NEXT: movl $0, %r9d -; HYBRID-NEXT: sbbq %rcx, %r9 -; HYBRID-NEXT: sbbq %r8, %rdi -; HYBRID-NEXT: andq %r8, %rdi -; HYBRID-NEXT: bsrq %rdi, %r8 +; HYBRID-NEXT: movq %rdi, %rax +; HYBRID-NEXT: movl $0, %edi +; HYBRID-NEXT: sbbq %rcx, %rdi +; HYBRID-NEXT: sbbq %r8, %r9 +; HYBRID-NEXT: andq %r8, %r9 +; HYBRID-NEXT: bsrq %r9, %r8 ; HYBRID-NEXT: xorq $63, %r8 -; HYBRID-NEXT: andq %rcx, 
%r9 -; HYBRID-NEXT: bsrq %r9, %rcx +; HYBRID-NEXT: andq %rcx, %rdi +; HYBRID-NEXT: bsrq %rdi, %rcx ; HYBRID-NEXT: xorq $63, %rcx ; HYBRID-NEXT: orq $64, %rcx -; HYBRID-NEXT: testq %rdi, %rdi +; HYBRID-NEXT: testq %r9, %r9 ; HYBRID-NEXT: cmovneq %r8, %rcx +; HYBRID-NEXT: xorps %xmm0, %xmm0 ; HYBRID-NEXT: andq %rdx, %r10 ; HYBRID-NEXT: bsrq %r10, %rdx ; HYBRID-NEXT: xorq $63, %rdx @@ -279,34 +278,34 @@ define i256 @test2(i256 %a) nounwind { ; HYBRID-NEXT: testq %r10, %r10 ; HYBRID-NEXT: cmovneq %rdx, %rsi ; HYBRID-NEXT: subq $-128, %rsi -; HYBRID-NEXT: orq %rdi, %r9 +; HYBRID-NEXT: orq %r9, %rdi ; HYBRID-NEXT: cmovneq %rcx, %rsi +; HYBRID-NEXT: movups %xmm0, 8(%rax) ; HYBRID-NEXT: movq %rsi, (%rax) -; HYBRID-NEXT: movq $0, 8(%rax) +; HYBRID-NEXT: movq $0, 24(%rax) ; HYBRID-NEXT: retq ; ; BURR-LABEL: test2: ; BURR: # %bb.0: -; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorps %xmm0, %xmm0 -; BURR-NEXT: movaps %xmm0, 16(%rdi) -; BURR-NEXT: xorl %edi, %edi +; BURR-NEXT: xorl %r9d, %r9d ; BURR-NEXT: movq %rsi, %r11 ; BURR-NEXT: negq %r11 ; BURR-NEXT: movl $0, %r10d ; BURR-NEXT: sbbq %rdx, %r10 -; BURR-NEXT: movl $0, %r9d -; BURR-NEXT: sbbq %rcx, %r9 -; BURR-NEXT: sbbq %r8, %rdi -; BURR-NEXT: andq %r8, %rdi -; BURR-NEXT: bsrq %rdi, %r8 +; BURR-NEXT: movq %rdi, %rax +; BURR-NEXT: movl $0, %edi +; BURR-NEXT: sbbq %rcx, %rdi +; BURR-NEXT: sbbq %r8, %r9 +; BURR-NEXT: andq %r8, %r9 +; BURR-NEXT: bsrq %r9, %r8 ; BURR-NEXT: xorq $63, %r8 -; BURR-NEXT: andq %rcx, %r9 -; BURR-NEXT: bsrq %r9, %rcx +; BURR-NEXT: andq %rcx, %rdi +; BURR-NEXT: bsrq %rdi, %rcx ; BURR-NEXT: xorq $63, %rcx ; BURR-NEXT: orq $64, %rcx -; BURR-NEXT: testq %rdi, %rdi +; BURR-NEXT: testq %r9, %r9 ; BURR-NEXT: cmovneq %r8, %rcx +; BURR-NEXT: xorps %xmm0, %xmm0 ; BURR-NEXT: andq %rdx, %r10 ; BURR-NEXT: bsrq %r10, %rdx ; BURR-NEXT: xorq $63, %rdx @@ -318,33 +317,34 @@ define i256 @test2(i256 %a) nounwind { ; BURR-NEXT: testq %r10, %r10 ; BURR-NEXT: cmovneq %rdx, %rsi ; BURR-NEXT: subq $-128, %rsi -; BURR-NEXT: orq %rdi, %r9 +; BURR-NEXT: orq %r9, %rdi ; BURR-NEXT: cmovneq %rcx, %rsi +; BURR-NEXT: movups %xmm0, 8(%rax) ; BURR-NEXT: movq %rsi, (%rax) -; BURR-NEXT: movq $0, 8(%rax) +; BURR-NEXT: movq $0, 24(%rax) ; BURR-NEXT: retq ; ; SRC-LABEL: test2: ; SRC: # %bb.0: -; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: xorl %edi, %edi +; SRC-NEXT: xorl %r9d, %r9d ; SRC-NEXT: movq %rsi, %r11 ; SRC-NEXT: negq %r11 ; SRC-NEXT: movl $0, %r10d ; SRC-NEXT: sbbq %rdx, %r10 -; SRC-NEXT: movl $0, %r9d -; SRC-NEXT: sbbq %rcx, %r9 -; SRC-NEXT: sbbq %r8, %rdi +; SRC-NEXT: movq %rdi, %rax +; SRC-NEXT: movl $0, %edi +; SRC-NEXT: sbbq %rcx, %rdi +; SRC-NEXT: sbbq %r8, %r9 ; SRC-NEXT: andq %rdx, %r10 -; SRC-NEXT: andq %rcx, %r9 -; SRC-NEXT: andq %r8, %rdi +; SRC-NEXT: andq %rcx, %rdi +; SRC-NEXT: andq %r8, %r9 ; SRC-NEXT: andq %rsi, %r11 -; SRC-NEXT: bsrq %rdi, %rcx +; SRC-NEXT: bsrq %r9, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r9, %rdx +; SRC-NEXT: bsrq %rdi, %rdx ; SRC-NEXT: xorq $63, %rdx ; SRC-NEXT: orq $64, %rdx -; SRC-NEXT: testq %rdi, %rdi +; SRC-NEXT: testq %r9, %r9 ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r10, %rcx ; SRC-NEXT: xorq $63, %rcx @@ -355,19 +355,17 @@ define i256 @test2(i256 %a) nounwind { ; SRC-NEXT: testq %r10, %r10 ; SRC-NEXT: cmovneq %rcx, %rsi ; SRC-NEXT: subq $-128, %rsi -; SRC-NEXT: orq %r9, %rdi +; SRC-NEXT: orq %rdi, %r9 ; SRC-NEXT: cmovneq %rdx, %rsi ; SRC-NEXT: xorps %xmm0, %xmm0 -; SRC-NEXT: movaps %xmm0, 16(%rax) +; SRC-NEXT: movups %xmm0, 8(%rax) ; SRC-NEXT: movq %rsi, (%rax) -; SRC-NEXT: movq $0, 8(%rax) +; 
SRC-NEXT: movq $0, 24(%rax) ; SRC-NEXT: retq ; ; LIN-LABEL: test2: ; LIN: # %bb.0: ; LIN-NEXT: movq %rdi, %rax -; LIN-NEXT: xorps %xmm0, %xmm0 -; LIN-NEXT: movaps %xmm0, 16(%rdi) ; LIN-NEXT: movl $127, %edi ; LIN-NEXT: movq %rsi, %r9 ; LIN-NEXT: negq %r9 @@ -399,7 +397,9 @@ define i256 @test2(i256 %a) nounwind { ; LIN-NEXT: orq %rdi, %rsi ; LIN-NEXT: cmoveq %rdx, %r8 ; LIN-NEXT: movq %r8, (%rax) -; LIN-NEXT: movq $0, 8(%rax) +; LIN-NEXT: xorps %xmm0, %xmm0 +; LIN-NEXT: movups %xmm0, 8(%rax) +; LIN-NEXT: movq $0, 24(%rax) ; LIN-NEXT: retq %b = sub i256 0, %a %c = and i256 %b, %a @@ -410,135 +410,131 @@ define i256 @test2(i256 %a) nounwind { define i256 @test3(i256 %n) nounwind { ; ILP-LABEL: test3: ; ILP: # %bb.0: +; ILP-NEXT: xorl %r10d, %r10d +; ILP-NEXT: movq %rsi, %r9 +; ILP-NEXT: negq %r9 +; ILP-NEXT: movl $0, %r11d +; ILP-NEXT: sbbq %rdx, %r11 ; ILP-NEXT: movq %rdi, %rax +; ILP-NEXT: movl $0, %edi +; ILP-NEXT: sbbq %rcx, %rdi ; ILP-NEXT: xorps %xmm0, %xmm0 -; ILP-NEXT: movaps %xmm0, 16(%rdi) -; ILP-NEXT: xorl %r9d, %r9d -; ILP-NEXT: movq %rsi, %rdi -; ILP-NEXT: negq %rdi -; ILP-NEXT: movl $0, %r10d -; ILP-NEXT: sbbq %rdx, %r10 -; ILP-NEXT: movl $0, %r11d -; ILP-NEXT: sbbq %rcx, %r11 -; ILP-NEXT: sbbq %r8, %r9 +; ILP-NEXT: sbbq %r8, %r10 ; ILP-NEXT: notq %r8 -; ILP-NEXT: andq %r9, %r8 -; ILP-NEXT: bsrq %r8, %r9 +; ILP-NEXT: andq %r10, %r8 +; ILP-NEXT: bsrq %r8, %r10 ; ILP-NEXT: notq %rdx -; ILP-NEXT: andq %r10, %rdx -; ILP-NEXT: bsrq %rdx, %r10 -; ILP-NEXT: xorq $63, %r9 +; ILP-NEXT: andq %r11, %rdx +; ILP-NEXT: bsrq %rdx, %r11 +; ILP-NEXT: xorq $63, %r10 ; ILP-NEXT: notq %rcx -; ILP-NEXT: andq %r11, %rcx -; ILP-NEXT: bsrq %rcx, %r11 -; ILP-NEXT: xorq $63, %r11 -; ILP-NEXT: orq $64, %r11 +; ILP-NEXT: andq %rdi, %rcx +; ILP-NEXT: bsrq %rcx, %rdi +; ILP-NEXT: xorq $63, %rdi +; ILP-NEXT: orq $64, %rdi ; ILP-NEXT: testq %r8, %r8 -; ILP-NEXT: cmovneq %r9, %r11 -; ILP-NEXT: xorq $63, %r10 +; ILP-NEXT: cmovneq %r10, %rdi +; ILP-NEXT: xorq $63, %r11 ; ILP-NEXT: notq %rsi -; ILP-NEXT: andq %rdi, %rsi -; ILP-NEXT: movl $127, %edi -; ILP-NEXT: bsrq %rsi, %rdi -; ILP-NEXT: xorq $63, %rdi -; ILP-NEXT: addq $64, %rdi +; ILP-NEXT: andq %r9, %rsi +; ILP-NEXT: movl $127, %r9d +; ILP-NEXT: bsrq %rsi, %r9 +; ILP-NEXT: xorq $63, %r9 +; ILP-NEXT: addq $64, %r9 ; ILP-NEXT: testq %rdx, %rdx -; ILP-NEXT: cmovneq %r10, %rdi -; ILP-NEXT: subq $-128, %rdi +; ILP-NEXT: cmovneq %r11, %r9 +; ILP-NEXT: subq $-128, %r9 ; ILP-NEXT: orq %r8, %rcx -; ILP-NEXT: cmovneq %r11, %rdi -; ILP-NEXT: movq %rdi, (%rax) -; ILP-NEXT: movq $0, 8(%rax) +; ILP-NEXT: cmovneq %rdi, %r9 +; ILP-NEXT: movups %xmm0, 8(%rax) +; ILP-NEXT: movq %r9, (%rax) +; ILP-NEXT: movq $0, 24(%rax) ; ILP-NEXT: retq ; ; HYBRID-LABEL: test3: ; HYBRID: # %bb.0: -; HYBRID-NEXT: pushq %rbx +; HYBRID-NEXT: xorl %r10d, %r10d +; HYBRID-NEXT: movq %rsi, %r9 +; HYBRID-NEXT: negq %r9 +; HYBRID-NEXT: movl $0, %r11d +; HYBRID-NEXT: sbbq %rdx, %r11 ; HYBRID-NEXT: movq %rdi, %rax +; HYBRID-NEXT: movl $0, %edi +; HYBRID-NEXT: sbbq %rcx, %rdi ; HYBRID-NEXT: xorps %xmm0, %xmm0 -; HYBRID-NEXT: movaps %xmm0, 16(%rdi) -; HYBRID-NEXT: xorl %r9d, %r9d -; HYBRID-NEXT: movq %rsi, %rdi -; HYBRID-NEXT: negq %rdi -; HYBRID-NEXT: movl $0, %r10d -; HYBRID-NEXT: sbbq %rdx, %r10 -; HYBRID-NEXT: movl $0, %r11d -; HYBRID-NEXT: sbbq %rcx, %r11 -; HYBRID-NEXT: sbbq %r8, %r9 +; HYBRID-NEXT: sbbq %r8, %r10 ; HYBRID-NEXT: notq %r8 -; HYBRID-NEXT: andq %r9, %r8 -; HYBRID-NEXT: bsrq %r8, %rbx -; HYBRID-NEXT: xorq $63, %rbx +; HYBRID-NEXT: andq %r10, %r8 +; HYBRID-NEXT: bsrq %r8, %r10 +; 
HYBRID-NEXT: xorq $63, %r10 ; HYBRID-NEXT: notq %rcx -; HYBRID-NEXT: andq %r11, %rcx -; HYBRID-NEXT: bsrq %rcx, %r9 -; HYBRID-NEXT: xorq $63, %r9 -; HYBRID-NEXT: orq $64, %r9 +; HYBRID-NEXT: andq %rdi, %rcx +; HYBRID-NEXT: bsrq %rcx, %rdi +; HYBRID-NEXT: xorq $63, %rdi +; HYBRID-NEXT: orq $64, %rdi ; HYBRID-NEXT: testq %r8, %r8 -; HYBRID-NEXT: cmovneq %rbx, %r9 +; HYBRID-NEXT: cmovneq %r10, %rdi ; HYBRID-NEXT: notq %rdx -; HYBRID-NEXT: andq %r10, %rdx +; HYBRID-NEXT: andq %r11, %rdx ; HYBRID-NEXT: bsrq %rdx, %r10 ; HYBRID-NEXT: xorq $63, %r10 ; HYBRID-NEXT: notq %rsi -; HYBRID-NEXT: andq %rdi, %rsi -; HYBRID-NEXT: movl $127, %edi -; HYBRID-NEXT: bsrq %rsi, %rdi -; HYBRID-NEXT: xorq $63, %rdi -; HYBRID-NEXT: addq $64, %rdi +; HYBRID-NEXT: andq %r9, %rsi +; HYBRID-NEXT: movl $127, %r9d +; HYBRID-NEXT: bsrq %rsi, %r9 +; HYBRID-NEXT: xorq $63, %r9 +; HYBRID-NEXT: addq $64, %r9 ; HYBRID-NEXT: testq %rdx, %rdx -; HYBRID-NEXT: cmovneq %r10, %rdi -; HYBRID-NEXT: subq $-128, %rdi +; HYBRID-NEXT: cmovneq %r10, %r9 +; HYBRID-NEXT: subq $-128, %r9 ; HYBRID-NEXT: orq %r8, %rcx -; HYBRID-NEXT: cmovneq %r9, %rdi -; HYBRID-NEXT: movq %rdi, (%rax) -; HYBRID-NEXT: movq $0, 8(%rax) -; HYBRID-NEXT: popq %rbx +; HYBRID-NEXT: cmovneq %rdi, %r9 +; HYBRID-NEXT: movups %xmm0, 8(%rax) +; HYBRID-NEXT: movq %r9, (%rax) +; HYBRID-NEXT: movq $0, 24(%rax) ; HYBRID-NEXT: retq ; ; BURR-LABEL: test3: ; BURR: # %bb.0: -; BURR-NEXT: pushq %rbx +; BURR-NEXT: xorl %r10d, %r10d +; BURR-NEXT: movq %rsi, %r9 +; BURR-NEXT: negq %r9 +; BURR-NEXT: movl $0, %r11d +; BURR-NEXT: sbbq %rdx, %r11 ; BURR-NEXT: movq %rdi, %rax +; BURR-NEXT: movl $0, %edi +; BURR-NEXT: sbbq %rcx, %rdi ; BURR-NEXT: xorps %xmm0, %xmm0 -; BURR-NEXT: movaps %xmm0, 16(%rdi) -; BURR-NEXT: xorl %r9d, %r9d -; BURR-NEXT: movq %rsi, %rdi -; BURR-NEXT: negq %rdi -; BURR-NEXT: movl $0, %r10d -; BURR-NEXT: sbbq %rdx, %r10 -; BURR-NEXT: movl $0, %r11d -; BURR-NEXT: sbbq %rcx, %r11 -; BURR-NEXT: sbbq %r8, %r9 +; BURR-NEXT: sbbq %r8, %r10 ; BURR-NEXT: notq %r8 -; BURR-NEXT: andq %r9, %r8 -; BURR-NEXT: bsrq %r8, %rbx -; BURR-NEXT: xorq $63, %rbx +; BURR-NEXT: andq %r10, %r8 +; BURR-NEXT: bsrq %r8, %r10 +; BURR-NEXT: xorq $63, %r10 ; BURR-NEXT: notq %rcx -; BURR-NEXT: andq %r11, %rcx -; BURR-NEXT: bsrq %rcx, %r9 -; BURR-NEXT: xorq $63, %r9 -; BURR-NEXT: orq $64, %r9 +; BURR-NEXT: andq %rdi, %rcx +; BURR-NEXT: bsrq %rcx, %rdi +; BURR-NEXT: xorq $63, %rdi +; BURR-NEXT: orq $64, %rdi ; BURR-NEXT: testq %r8, %r8 -; BURR-NEXT: cmovneq %rbx, %r9 +; BURR-NEXT: cmovneq %r10, %rdi ; BURR-NEXT: notq %rdx -; BURR-NEXT: andq %r10, %rdx +; BURR-NEXT: andq %r11, %rdx ; BURR-NEXT: bsrq %rdx, %r10 ; BURR-NEXT: xorq $63, %r10 ; BURR-NEXT: notq %rsi -; BURR-NEXT: andq %rdi, %rsi -; BURR-NEXT: movl $127, %edi -; BURR-NEXT: bsrq %rsi, %rdi -; BURR-NEXT: xorq $63, %rdi -; BURR-NEXT: addq $64, %rdi +; BURR-NEXT: andq %r9, %rsi +; BURR-NEXT: movl $127, %r9d +; BURR-NEXT: bsrq %rsi, %r9 +; BURR-NEXT: xorq $63, %r9 +; BURR-NEXT: addq $64, %r9 ; BURR-NEXT: testq %rdx, %rdx -; BURR-NEXT: cmovneq %r10, %rdi -; BURR-NEXT: subq $-128, %rdi +; BURR-NEXT: cmovneq %r10, %r9 +; BURR-NEXT: subq $-128, %r9 ; BURR-NEXT: orq %r8, %rcx -; BURR-NEXT: cmovneq %r9, %rdi -; BURR-NEXT: movq %rdi, (%rax) -; BURR-NEXT: movq $0, 8(%rax) -; BURR-NEXT: popq %rbx +; BURR-NEXT: cmovneq %rdi, %r9 +; BURR-NEXT: movups %xmm0, 8(%rax) +; BURR-NEXT: movq %r9, (%rax) +; BURR-NEXT: movq $0, 24(%rax) ; BURR-NEXT: retq ; ; SRC-LABEL: test3: @@ -553,8 +549,8 @@ define i256 @test3(i256 %n) nounwind { ; SRC-NEXT: notq %rdx ; SRC-NEXT: 
movl $0, %r11d ; SRC-NEXT: sbbq %rcx, %r11 -; SRC-NEXT: notq %rcx ; SRC-NEXT: sbbq %r8, %r9 +; SRC-NEXT: notq %rcx ; SRC-NEXT: notq %r8 ; SRC-NEXT: andq %r10, %rdx ; SRC-NEXT: andq %r11, %rcx @@ -579,25 +575,22 @@ define i256 @test3(i256 %n) nounwind { ; SRC-NEXT: orq %rcx, %r8 ; SRC-NEXT: cmovneq %r9, %r10 ; SRC-NEXT: xorps %xmm0, %xmm0 -; SRC-NEXT: movaps %xmm0, 16(%rax) +; SRC-NEXT: movups %xmm0, 8(%rax) ; SRC-NEXT: movq %r10, (%rax) -; SRC-NEXT: movq $0, 8(%rax) +; SRC-NEXT: movq $0, 24(%rax) ; SRC-NEXT: retq ; ; LIN-LABEL: test3: ; LIN: # %bb.0: -; LIN-NEXT: movq %rdi, %rax -; LIN-NEXT: xorps %xmm0, %xmm0 -; LIN-NEXT: movaps %xmm0, 16(%rdi) -; LIN-NEXT: movl $127, %r9d -; LIN-NEXT: movq %rsi, %rdi -; LIN-NEXT: negq %rdi +; LIN-NEXT: movl $127, %eax +; LIN-NEXT: movq %rsi, %r9 +; LIN-NEXT: negq %r9 ; LIN-NEXT: notq %rsi -; LIN-NEXT: andq %rdi, %rsi -; LIN-NEXT: bsrq %rsi, %r9 -; LIN-NEXT: xorq $63, %r9 -; LIN-NEXT: addq $64, %r9 -; LIN-NEXT: xorl %edi, %edi +; LIN-NEXT: andq %r9, %rsi +; LIN-NEXT: bsrq %rsi, %rax +; LIN-NEXT: xorq $63, %rax +; LIN-NEXT: addq $64, %rax +; LIN-NEXT: xorl %r9d, %r9d ; LIN-NEXT: movl $0, %esi ; LIN-NEXT: sbbq %rdx, %rsi ; LIN-NEXT: notq %rdx @@ -605,18 +598,19 @@ define i256 @test3(i256 %n) nounwind { ; LIN-NEXT: bsrq %rdx, %rsi ; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: testq %rdx, %rdx -; LIN-NEXT: cmoveq %r9, %rsi +; LIN-NEXT: cmoveq %rax, %rsi ; LIN-NEXT: subq $-128, %rsi -; LIN-NEXT: movl $0, %edx -; LIN-NEXT: sbbq %rcx, %rdx +; LIN-NEXT: movl $0, %eax +; LIN-NEXT: sbbq %rcx, %rax ; LIN-NEXT: notq %rcx -; LIN-NEXT: andq %rdx, %rcx +; LIN-NEXT: andq %rax, %rcx ; LIN-NEXT: bsrq %rcx, %rdx ; LIN-NEXT: xorq $63, %rdx ; LIN-NEXT: orq $64, %rdx -; LIN-NEXT: sbbq %r8, %rdi +; LIN-NEXT: sbbq %r8, %r9 +; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: notq %r8 -; LIN-NEXT: andq %rdi, %r8 +; LIN-NEXT: andq %r9, %r8 ; LIN-NEXT: bsrq %r8, %rdi ; LIN-NEXT: xorq $63, %rdi ; LIN-NEXT: testq %r8, %r8 @@ -624,7 +618,9 @@ define i256 @test3(i256 %n) nounwind { ; LIN-NEXT: orq %rcx, %r8 ; LIN-NEXT: cmoveq %rsi, %rdi ; LIN-NEXT: movq %rdi, (%rax) -; LIN-NEXT: movq $0, 8(%rax) +; LIN-NEXT: xorps %xmm0, %xmm0 +; LIN-NEXT: movups %xmm0, 8(%rax) +; LIN-NEXT: movq $0, 24(%rax) ; LIN-NEXT: retq %m = sub i256 -1, %n %x = sub i256 0, %n @@ -723,237 +719,247 @@ define i64 @test4(i64 %a, i64 %b) nounwind { define i256 @PR25498(i256 %a) nounwind { ; ILP-LABEL: PR25498: ; ILP: # %bb.0: +; ILP-NEXT: pushq %r14 ; ILP-NEXT: pushq %rbx -; ILP-NEXT: movq %rdi, %rax -; ILP-NEXT: xorl %edi, %edi -; ILP-NEXT: movq %rsi, %rbx -; ILP-NEXT: negq %rbx -; ILP-NEXT: movl $0, %r11d -; ILP-NEXT: sbbq %rdx, %r11 -; ILP-NEXT: movl $0, %r9d -; ILP-NEXT: sbbq %rcx, %r9 +; ILP-NEXT: xorl %r9d, %r9d +; ILP-NEXT: movq %rsi, %r14 +; ILP-NEXT: negq %r14 +; ILP-NEXT: movl $0, %ebx +; ILP-NEXT: sbbq %rdx, %rbx ; ILP-NEXT: movl $0, %r10d -; ILP-NEXT: sbbq %r8, %r10 +; ILP-NEXT: sbbq %rcx, %r10 +; ILP-NEXT: movl $0, %r11d +; ILP-NEXT: sbbq %r8, %r11 +; ILP-NEXT: movq %rdi, %rax ; ILP-NEXT: orq %r8, %rdx ; ILP-NEXT: orq %rcx, %rsi ; ILP-NEXT: orq %rdx, %rsi ; ILP-NEXT: je .LBB4_1 ; ILP-NEXT: # %bb.2: # %cond.false -; ILP-NEXT: bsrq %r11, %rdx -; ILP-NEXT: bsrq %r10, %rcx +; ILP-NEXT: bsrq %rbx, %rdx +; ILP-NEXT: bsrq %r11, %rcx ; ILP-NEXT: xorq $63, %rcx -; ILP-NEXT: bsrq %r9, %rsi +; ILP-NEXT: bsrq %r10, %rsi ; ILP-NEXT: xorq $63, %rsi ; ILP-NEXT: orq $64, %rsi -; ILP-NEXT: testq %r10, %r10 +; ILP-NEXT: testq %r11, %r11 ; ILP-NEXT: cmovneq %rcx, %rsi ; ILP-NEXT: xorq $63, %rdx -; ILP-NEXT: bsrq %rbx, %rcx +; ILP-NEXT: bsrq 
%r14, %rcx ; ILP-NEXT: xorq $63, %rcx ; ILP-NEXT: orq $64, %rcx -; ILP-NEXT: testq %r11, %r11 +; ILP-NEXT: testq %rbx, %rbx ; ILP-NEXT: cmovneq %rdx, %rcx ; ILP-NEXT: orq $128, %rcx -; ILP-NEXT: xorl %edi, %edi -; ILP-NEXT: orq %r10, %r9 +; ILP-NEXT: xorl %r9d, %r9d +; ILP-NEXT: orq %r11, %r10 ; ILP-NEXT: cmovneq %rsi, %rcx ; ILP-NEXT: jmp .LBB4_3 ; ILP-NEXT: .LBB4_1: ; ILP-NEXT: movl $256, %ecx # imm = 0x100 ; ILP-NEXT: .LBB4_3: # %cond.end ; ILP-NEXT: movq %rcx, (%rax) -; ILP-NEXT: movq %rdi, 8(%rax) -; ILP-NEXT: movq %rdi, 16(%rax) -; ILP-NEXT: movq %rdi, 24(%rax) +; ILP-NEXT: movq %r9, 8(%rax) +; ILP-NEXT: movq %r9, 16(%rax) +; ILP-NEXT: movq %r9, 24(%rax) ; ILP-NEXT: popq %rbx +; ILP-NEXT: popq %r14 ; ILP-NEXT: retq ; ; HYBRID-LABEL: PR25498: ; HYBRID: # %bb.0: +; HYBRID-NEXT: pushq %r14 ; HYBRID-NEXT: pushq %rbx -; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: xorl %edi, %edi -; HYBRID-NEXT: movq %rsi, %rbx -; HYBRID-NEXT: negq %rbx +; HYBRID-NEXT: xorl %r9d, %r9d +; HYBRID-NEXT: movq %rsi, %r14 +; HYBRID-NEXT: negq %r14 ; HYBRID-NEXT: movl $0, %r11d ; HYBRID-NEXT: sbbq %rdx, %r11 -; HYBRID-NEXT: movl $0, %r9d -; HYBRID-NEXT: sbbq %rcx, %r9 ; HYBRID-NEXT: movl $0, %r10d -; HYBRID-NEXT: sbbq %r8, %r10 +; HYBRID-NEXT: sbbq %rcx, %r10 +; HYBRID-NEXT: movl $0, %ebx +; HYBRID-NEXT: sbbq %r8, %rbx +; HYBRID-NEXT: movq %rdi, %rax ; HYBRID-NEXT: orq %r8, %rdx ; HYBRID-NEXT: orq %rcx, %rsi ; HYBRID-NEXT: orq %rdx, %rsi ; HYBRID-NEXT: je .LBB4_1 ; HYBRID-NEXT: # %bb.2: # %cond.false -; HYBRID-NEXT: bsrq %r10, %rcx +; HYBRID-NEXT: bsrq %rbx, %rcx ; HYBRID-NEXT: xorq $63, %rcx -; HYBRID-NEXT: bsrq %r9, %rdx +; HYBRID-NEXT: bsrq %r10, %rdx ; HYBRID-NEXT: xorq $63, %rdx ; HYBRID-NEXT: orq $64, %rdx -; HYBRID-NEXT: testq %r10, %r10 +; HYBRID-NEXT: testq %rbx, %rbx ; HYBRID-NEXT: cmovneq %rcx, %rdx ; HYBRID-NEXT: bsrq %r11, %rsi ; HYBRID-NEXT: xorq $63, %rsi -; HYBRID-NEXT: bsrq %rbx, %rcx +; HYBRID-NEXT: bsrq %r14, %rcx ; HYBRID-NEXT: xorq $63, %rcx ; HYBRID-NEXT: orq $64, %rcx ; HYBRID-NEXT: testq %r11, %r11 ; HYBRID-NEXT: cmovneq %rsi, %rcx ; HYBRID-NEXT: orq $128, %rcx -; HYBRID-NEXT: orq %r10, %r9 +; HYBRID-NEXT: orq %rbx, %r10 ; HYBRID-NEXT: cmovneq %rdx, %rcx -; HYBRID-NEXT: xorl %edi, %edi +; HYBRID-NEXT: xorl %r9d, %r9d ; HYBRID-NEXT: jmp .LBB4_3 ; HYBRID-NEXT: .LBB4_1: ; HYBRID-NEXT: movl $256, %ecx # imm = 0x100 ; HYBRID-NEXT: .LBB4_3: # %cond.end ; HYBRID-NEXT: movq %rcx, (%rax) -; HYBRID-NEXT: movq %rdi, 8(%rax) -; HYBRID-NEXT: movq %rdi, 16(%rax) -; HYBRID-NEXT: movq %rdi, 24(%rax) +; HYBRID-NEXT: movq %r9, 8(%rax) +; HYBRID-NEXT: movq %r9, 16(%rax) +; HYBRID-NEXT: movq %r9, 24(%rax) ; HYBRID-NEXT: popq %rbx +; HYBRID-NEXT: popq %r14 ; HYBRID-NEXT: retq ; ; BURR-LABEL: PR25498: ; BURR: # %bb.0: +; BURR-NEXT: pushq %r14 ; BURR-NEXT: pushq %rbx -; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: xorl %edi, %edi -; BURR-NEXT: movq %rsi, %rbx -; BURR-NEXT: negq %rbx +; BURR-NEXT: xorl %r9d, %r9d +; BURR-NEXT: movq %rsi, %r14 +; BURR-NEXT: negq %r14 ; BURR-NEXT: movl $0, %r11d ; BURR-NEXT: sbbq %rdx, %r11 -; BURR-NEXT: movl $0, %r9d -; BURR-NEXT: sbbq %rcx, %r9 ; BURR-NEXT: movl $0, %r10d -; BURR-NEXT: sbbq %r8, %r10 +; BURR-NEXT: sbbq %rcx, %r10 +; BURR-NEXT: movl $0, %ebx +; BURR-NEXT: sbbq %r8, %rbx +; BURR-NEXT: movq %rdi, %rax ; BURR-NEXT: orq %r8, %rdx ; BURR-NEXT: orq %rcx, %rsi ; BURR-NEXT: orq %rdx, %rsi ; BURR-NEXT: je .LBB4_1 ; BURR-NEXT: # %bb.2: # %cond.false -; BURR-NEXT: bsrq %r10, %rcx +; BURR-NEXT: bsrq %rbx, %rcx ; BURR-NEXT: xorq $63, %rcx -; BURR-NEXT: bsrq %r9, %rdx +; 
BURR-NEXT: bsrq %r10, %rdx ; BURR-NEXT: xorq $63, %rdx ; BURR-NEXT: orq $64, %rdx -; BURR-NEXT: testq %r10, %r10 +; BURR-NEXT: testq %rbx, %rbx ; BURR-NEXT: cmovneq %rcx, %rdx ; BURR-NEXT: bsrq %r11, %rsi ; BURR-NEXT: xorq $63, %rsi -; BURR-NEXT: bsrq %rbx, %rcx +; BURR-NEXT: bsrq %r14, %rcx ; BURR-NEXT: xorq $63, %rcx ; BURR-NEXT: orq $64, %rcx ; BURR-NEXT: testq %r11, %r11 ; BURR-NEXT: cmovneq %rsi, %rcx ; BURR-NEXT: orq $128, %rcx -; BURR-NEXT: orq %r10, %r9 +; BURR-NEXT: orq %rbx, %r10 ; BURR-NEXT: cmovneq %rdx, %rcx -; BURR-NEXT: xorl %edi, %edi +; BURR-NEXT: xorl %r9d, %r9d ; BURR-NEXT: jmp .LBB4_3 ; BURR-NEXT: .LBB4_1: ; BURR-NEXT: movl $256, %ecx # imm = 0x100 ; BURR-NEXT: .LBB4_3: # %cond.end ; BURR-NEXT: movq %rcx, (%rax) -; BURR-NEXT: movq %rdi, 8(%rax) -; BURR-NEXT: movq %rdi, 16(%rax) -; BURR-NEXT: movq %rdi, 24(%rax) +; BURR-NEXT: movq %r9, 8(%rax) +; BURR-NEXT: movq %r9, 16(%rax) +; BURR-NEXT: movq %r9, 24(%rax) ; BURR-NEXT: popq %rbx +; BURR-NEXT: popq %r14 ; BURR-NEXT: retq ; ; SRC-LABEL: PR25498: ; SRC: # %bb.0: +; SRC-NEXT: pushq %r14 ; SRC-NEXT: pushq %rbx -; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: xorl %edi, %edi -; SRC-NEXT: movq %rsi, %rbx -; SRC-NEXT: negq %rbx +; SRC-NEXT: xorl %r9d, %r9d +; SRC-NEXT: movq %rsi, %r14 +; SRC-NEXT: negq %r14 ; SRC-NEXT: movl $0, %r11d ; SRC-NEXT: sbbq %rdx, %r11 -; SRC-NEXT: movl $0, %r9d -; SRC-NEXT: sbbq %rcx, %r9 ; SRC-NEXT: movl $0, %r10d -; SRC-NEXT: sbbq %r8, %r10 +; SRC-NEXT: sbbq %rcx, %r10 +; SRC-NEXT: movl $0, %ebx +; SRC-NEXT: sbbq %r8, %rbx +; SRC-NEXT: movq %rdi, %rax ; SRC-NEXT: orq %r8, %rdx ; SRC-NEXT: orq %rcx, %rsi ; SRC-NEXT: orq %rdx, %rsi ; SRC-NEXT: je .LBB4_1 ; SRC-NEXT: # %bb.2: # %cond.false -; SRC-NEXT: bsrq %r10, %rcx +; SRC-NEXT: bsrq %rbx, %rcx ; SRC-NEXT: xorq $63, %rcx -; SRC-NEXT: bsrq %r9, %rdx +; SRC-NEXT: bsrq %r10, %rdx ; SRC-NEXT: xorq $63, %rdx ; SRC-NEXT: orq $64, %rdx -; SRC-NEXT: testq %r10, %r10 +; SRC-NEXT: testq %rbx, %rbx ; SRC-NEXT: cmovneq %rcx, %rdx ; SRC-NEXT: bsrq %r11, %rsi ; SRC-NEXT: xorq $63, %rsi -; SRC-NEXT: bsrq %rbx, %rcx +; SRC-NEXT: bsrq %r14, %rcx ; SRC-NEXT: xorq $63, %rcx ; SRC-NEXT: orq $64, %rcx ; SRC-NEXT: testq %r11, %r11 ; SRC-NEXT: cmovneq %rsi, %rcx ; SRC-NEXT: orq $128, %rcx -; SRC-NEXT: orq %r10, %r9 +; SRC-NEXT: orq %rbx, %r10 ; SRC-NEXT: cmovneq %rdx, %rcx -; SRC-NEXT: xorl %edi, %edi +; SRC-NEXT: xorl %r9d, %r9d ; SRC-NEXT: jmp .LBB4_3 ; SRC-NEXT: .LBB4_1: ; SRC-NEXT: movl $256, %ecx # imm = 0x100 ; SRC-NEXT: .LBB4_3: # %cond.end ; SRC-NEXT: movq %rcx, (%rax) -; SRC-NEXT: movq %rdi, 8(%rax) -; SRC-NEXT: movq %rdi, 16(%rax) -; SRC-NEXT: movq %rdi, 24(%rax) +; SRC-NEXT: movq %r9, 8(%rax) +; SRC-NEXT: movq %r9, 16(%rax) +; SRC-NEXT: movq %r9, 24(%rax) ; SRC-NEXT: popq %rbx +; SRC-NEXT: popq %r14 ; SRC-NEXT: retq ; ; LIN-LABEL: PR25498: ; LIN: # %bb.0: +; LIN-NEXT: pushq %r14 ; LIN-NEXT: pushq %rbx -; LIN-NEXT: movq %rdi, %rax -; LIN-NEXT: movq %rsi, %rbx -; LIN-NEXT: negq %rbx -; LIN-NEXT: xorl %edi, %edi -; LIN-NEXT: movl $0, %r11d -; LIN-NEXT: sbbq %rdx, %r11 -; LIN-NEXT: movl $0, %r9d -; LIN-NEXT: sbbq %rcx, %r9 +; LIN-NEXT: movq %rsi, %r14 +; LIN-NEXT: negq %r14 +; LIN-NEXT: xorl %r9d, %r9d +; LIN-NEXT: movl $0, %ebx +; LIN-NEXT: sbbq %rdx, %rbx ; LIN-NEXT: movl $0, %r10d -; LIN-NEXT: sbbq %r8, %r10 +; LIN-NEXT: sbbq %rcx, %r10 +; LIN-NEXT: movl $0, %r11d +; LIN-NEXT: sbbq %r8, %r11 +; LIN-NEXT: movq %rdi, %rax ; LIN-NEXT: orq %rcx, %rsi ; LIN-NEXT: orq %r8, %rdx ; LIN-NEXT: orq %rsi, %rdx ; LIN-NEXT: je .LBB4_1 ; LIN-NEXT: # %bb.2: # %cond.false -; LIN-NEXT: 
bsrq %rbx, %rcx +; LIN-NEXT: bsrq %r14, %rcx ; LIN-NEXT: xorq $63, %rcx ; LIN-NEXT: orq $64, %rcx -; LIN-NEXT: bsrq %r11, %rdx +; LIN-NEXT: bsrq %rbx, %rdx ; LIN-NEXT: xorq $63, %rdx -; LIN-NEXT: testq %r11, %r11 +; LIN-NEXT: testq %rbx, %rbx ; LIN-NEXT: cmoveq %rcx, %rdx ; LIN-NEXT: orq $128, %rdx -; LIN-NEXT: bsrq %r9, %rsi +; LIN-NEXT: bsrq %r10, %rsi ; LIN-NEXT: xorq $63, %rsi ; LIN-NEXT: orq $64, %rsi -; LIN-NEXT: bsrq %r10, %rcx +; LIN-NEXT: bsrq %r11, %rcx ; LIN-NEXT: xorq $63, %rcx -; LIN-NEXT: testq %r10, %r10 +; LIN-NEXT: testq %r11, %r11 ; LIN-NEXT: cmoveq %rsi, %rcx -; LIN-NEXT: orq %r10, %r9 +; LIN-NEXT: orq %r11, %r10 ; LIN-NEXT: cmoveq %rdx, %rcx -; LIN-NEXT: xorl %edi, %edi +; LIN-NEXT: xorl %r9d, %r9d ; LIN-NEXT: jmp .LBB4_3 ; LIN-NEXT: .LBB4_1: ; LIN-NEXT: movl $256, %ecx # imm = 0x100 ; LIN-NEXT: .LBB4_3: # %cond.end ; LIN-NEXT: movq %rcx, (%rax) -; LIN-NEXT: movq %rdi, 8(%rax) -; LIN-NEXT: movq %rdi, 16(%rax) -; LIN-NEXT: movq %rdi, 24(%rax) +; LIN-NEXT: movq %r9, 8(%rax) +; LIN-NEXT: movq %r9, 16(%rax) +; LIN-NEXT: movq %r9, 24(%rax) ; LIN-NEXT: popq %rbx +; LIN-NEXT: popq %r14 ; LIN-NEXT: retq %b = sub i256 0, %a %cmpz = icmp eq i256 %b, 0 diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 874913629e9e3..01e814f84f1f1 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -471,9 +471,9 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: setl %dl ; X86-NEXT: setg %dh ; X86-NEXT: subb %dl, %dh @@ -482,20 +482,20 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh -; X86-NEXT: movsbl %bh, %edi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh -; X86-NEXT: movsbl %bh, %esi +; X86-NEXT: movsbl %bh, %edi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: setl %cl ; X86-NEXT: setg %ch ; X86-NEXT: subb %cl, %ch ; X86-NEXT: movsbl %ch, %ecx ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -628,9 +628,9 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: setl %ch ; X86-NEXT: setg %cl ; X86-NEXT: subb %ch, %cl @@ -713,9 +713,9 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: setl %dl ; X86-NEXT: setg %dh ; X86-NEXT: subb %dl, %dh @@ -724,8 +724,8 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 
x i8> %y) nounwind { ; X86-NEXT: setl %bl ; X86-NEXT: setg %bh ; X86-NEXT: subb %bl, %bh -; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: setl %ch ; X86-NEXT: setg %bl ; X86-NEXT: subb %ch, %bl @@ -858,55 +858,56 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $16, %esp +; X86-NEXT: subl $12, %esp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %dh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movb {{[0-9]+}}(%esp), %bh -; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bh -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: setl %bl +; X86-NEXT: setg %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dh -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: setl %dh +; X86-NEXT: setg %bl +; X86-NEXT: subb %dh, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: setl %dh +; X86-NEXT: setg %bl +; X86-NEXT: subb %dh, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch +; X86-NEXT: setl %ch +; X86-NEXT: setg %dh +; X86-NEXT: subb %ch, %dh +; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: setl %ah +; X86-NEXT: setg %ch +; X86-NEXT: subb %ah, %ch +; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl -; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: setl %al -; X86-NEXT: setg %bh -; X86-NEXT: subb %al, %bh -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setl %ah +; X86-NEXT: setg %ch +; X86-NEXT: subb %ah, %ch +; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl +; X86-NEXT: setl %dl +; X86-NEXT: setg %ah +; X86-NEXT: subb %dl, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl +; X86-NEXT: setl %cl +; X86-NEXT: setg %dl +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movb %dl, (%esp) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al 
; X86-NEXT: setl %al ; X86-NEXT: setg %bl @@ -914,8 +915,8 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: setl %al -; X86-NEXT: setg %dh -; X86-NEXT: subb %al, %dh +; X86-NEXT: setg %bh +; X86-NEXT: subb %al, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: setl %al @@ -928,8 +929,7 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: setl %al @@ -941,49 +941,48 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: setl %al ; X86-NEXT: setg %dl ; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al +; X86-NEXT: movsbl %dl, %esi ; X86-NEXT: setl %al ; X86-NEXT: setg %ah ; X86-NEXT: subb %al, %ah -; X86-NEXT: movsbl %ah, %esi +; X86-NEXT: movsbl %ah, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: setl %al -; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %ecx +; X86-NEXT: setg %ah +; X86-NEXT: subb %al, %ah +; X86-NEXT: movsbl %ah, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ecx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %edi, 52(%eax) +; X86-NEXT: movl %edx, 56(%eax) +; X86-NEXT: movl %esi, 52(%eax) ; X86-NEXT: movl %ebp, 48(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movl %edi, 44(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movsbl %dh, %edx -; X86-NEXT: movl %edx, 36(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: movsbl %bh, %ecx +; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movsbl %bl, %esi ; X86-NEXT: movl %esi, 32(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X86-NEXT: movsbl %bh, %edi +; X86-NEXT: movsbl (%esp), %edi # 1-byte Folded Reload ; X86-NEXT: movl %edi, 28(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X86-NEXT: movl %ebx, 24(%eax) ; X86-NEXT: movl %edi, 20(%eax) ; X86-NEXT: movl %esi, 16(%eax) -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $16, %esp +; X86-NEXT: addl $12, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1513,68 +1512,68 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmpl %edx, %esi ; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: setl %al -; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: cmpl %esi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %ebx, %edi ; X86-NEXT: setl %ah ; X86-NEXT: subb %al, %ah ; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl %ecx, %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %ebp, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: setl %al -; X86-NEXT: cmpl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: setl %ah -; X86-NEXT: subb %al, %ah -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl %edi, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: setl %al -; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setl %cl +; X86-NEXT: subb %bl, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: cmpl %edi, %ebp +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: setl %bl +; X86-NEXT: cmpl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: setl %dl -; X86-NEXT: subb %al, %dl +; X86-NEXT: subb %bl, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edi, %ebp +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: setl %cl ; X86-NEXT: subb %bl, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: cmpl %esi, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: sbbl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: setl %bh @@ -1611,20 +1610,20 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind { ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: setl %bl ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %esi, 
%eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: setl %dl ; X86-NEXT: subb %bl, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: cmpl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %ecx, %edi ; X86-NEXT: setl %bl -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: setl %dl @@ -2221,7 +2220,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $52, %esp +; X86-NEXT: subl $56, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al @@ -2246,10 +2245,10 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: addb %dh, %dh -; X86-NEXT: sarb %dh -; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb %cl, %cl +; X86-NEXT: sarb %cl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addb %dl, %dl ; X86-NEXT: sarb %dl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2258,12 +2257,12 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: addb %ah, %ah ; X86-NEXT: sarb %ah -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addb %cl, %cl -; X86-NEXT: sarb %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: addb %ch, %ch ; X86-NEXT: sarb %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: addb %dh, %dh +; X86-NEXT: sarb %dh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addb %bl, %bl ; X86-NEXT: sarb %bl @@ -2280,131 +2279,134 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %cl, %ch -; X86-NEXT: setl %cl -; X86-NEXT: setg %ch -; X86-NEXT: subb %cl, %ch -; X86-NEXT: movsbl %ch, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %ch, %dh +; X86-NEXT: setl %ch +; X86-NEXT: setg %dh +; X86-NEXT: subb %ch, %dh +; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpb %al, %ah ; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movsbl %cl, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ecx, (%edi) -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %dh, %dl -; 
X86-NEXT: setl %al +; X86-NEXT: setg %ah +; X86-NEXT: subb %al, %ah +; X86-NEXT: movsbl %ah, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %eax, (%ebx) +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %dl +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl +; X86-NEXT: subb %cl, %dl ; X86-NEXT: movsbl %dl, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl +; X86-NEXT: subb %cl, %dl +; X86-NEXT: movsbl %dl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl +; X86-NEXT: setg %dl +; X86-NEXT: subb %cl, %dl ; X86-NEXT: movsbl %dl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al -; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: setl %dl ; X86-NEXT: setg %dh ; X86-NEXT: subb %dl, %dh -; X86-NEXT: movsbl %dh, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, 96(%edi) -; X86-NEXT: movl %ebx, 92(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 80(%edi) -; X86-NEXT: movl %eax, 68(%edi) -; X86-NEXT: movl %eax, 64(%edi) -; X86-NEXT: movl %esi, 52(%edi) -; X86-NEXT: movl %esi, 48(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 36(%edi) -; X86-NEXT: movl %ebp, 24(%edi) -; X86-NEXT: movl %ebp, 20(%edi) -; X86-NEXT: movl %ecx, 8(%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movw %cx, 100(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $30, %edx, %ecx -; X86-NEXT: movl %ecx, 88(%edi) +; X86-NEXT: movsbl %dh, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 96(%ebx) +; X86-NEXT: movl %edx, 92(%ebx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $9, %edx, %ecx -; X86-NEXT: movl %ecx, 76(%edi) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $20, %edx, %ecx -; X86-NEXT: movl %ecx, 60(%edi) +; X86-NEXT: movl %ecx, 80(%ebx) +; X86-NEXT: movl %esi, 68(%ebx) +; X86-NEXT: movl %esi, 64(%ebx) +; X86-NEXT: movl %edi, 52(%ebx) +; X86-NEXT: movl %edi, 48(%ebx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 36(%ebx) +; X86-NEXT: movl %ebp, 24(%ebx) +; X86-NEXT: movl %ebp, 20(%ebx) +; X86-NEXT: movl %eax, 8(%ebx) +; X86-NEXT: movl %eax, 4(%ebx) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $30, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $9, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movw %dx, 100(%ebx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $9, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 88(%ebx) ; X86-NEXT: movl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, 44(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $20, %edx, %ecx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %eax, 76(%ebx) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $31, %ebx, %eax +; X86-NEXT: movl %ecx, 60(%edx) +; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $10, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $10, %edx, %ecx -; X86-NEXT: movl %ecx, 32(%edi) -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: shldl $10, %edx, %eax +; X86-NEXT: movl %eax, 32(%ecx) +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $21, %ebx, %ecx -; X86-NEXT: movl %ecx, 16(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shll $9, %ecx -; X86-NEXT: andl $511, %eax # imm = 0x1FF -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl %eax, 72(%edi) +; X86-NEXT: shldl $21, %ebx, %eax +; X86-NEXT: movl %eax, 16(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shll $20, %eax -; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF +; X86-NEXT: shll $9, %eax +; X86-NEXT: andl $511, %esi # imm = 0x1FF ; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, 56(%edi) +; X86-NEXT: movl %esi, 72(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $20, %eax +; X86-NEXT: andl $1048575, %edi # imm = 0xFFFFF +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, 56(%ecx) ; X86-NEXT: shll $10, %edx ; X86-NEXT: andl $1023, %ebp # imm = 0x3FF ; X86-NEXT: orl %edx, %ebp -; X86-NEXT: movl %ebp, 28(%edi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shll $21, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %ebp, 28(%ecx) +; X86-NEXT: shll $21, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, 
12(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: andl $7, %eax -; X86-NEXT: movb %al, 102(%edi) +; X86-NEXT: movb %al, 102(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $30, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 84(%edi) +; X86-NEXT: movl %eax, 84(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $31, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 40(%edi) -; X86-NEXT: movl %edi, %eax -; X86-NEXT: addl $52, %esp +; X86-NEXT: movl %eax, 40(%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll index 456819179fcdc..a3c865ed1bc29 100644 --- a/llvm/test/CodeGen/X86/sdiv-exact.ll +++ b/llvm/test/CodeGen/X86/sdiv-exact.ll @@ -85,9 +85,9 @@ define <4 x i32> @test5(<4 x i32> %x) { ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] ; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -110,9 +110,9 @@ define <4 x i32> @test6(<4 x i32> %x) { ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] ; X86-NEXT: pmuludq %xmm0, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -132,8 +132,8 @@ define <4 x i32> @test7(<4 x i32> %x) { ; X86: # %bb.0: ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -154,9 +154,9 @@ define <4 x i32> @test8(<4 x i32> %x) { ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] ; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0ab6178..0bcb0521b217f 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -617,9 +617,9 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cmovgeq %rdx, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 
0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rbp, %rcx -; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq $-1, %rax ; X64-NEXT: sbbq %r14, %rax +; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: cmovgeq %rcx, %rbp ; X64-NEXT: movq %rbp, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -674,20 +674,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %rbp, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: psrlq $31, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm0, %rbx ; X64-NEXT: movq %rbx, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; X64-NEXT: pxor %xmm1, %xmm1 ; X64-NEXT: pcmpgtd %xmm0, %xmm1 @@ -992,117 +992,120 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmovgel %edx, %edi ; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bl +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bh -; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: setne %cl -; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx 
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: xorb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: setne %al +; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edx -; X86-NEXT: cmovgel %ecx, %edi -; X86-NEXT: cmovgel %ecx, %esi +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %edx +; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-1, %ebx -; X86-NEXT: cmovgel %ebx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: cmovgel %ebx, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovgel %eax, %ecx ; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: shldl $31, %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bl +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %bh -; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: setne %cl -; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: xorb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: setne %al +; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edx -; X86-NEXT: cmovgel %ecx, %edi -; X86-NEXT: cmovgel %ecx, %ebx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovgel %esi, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %esi, %ebx -; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovgel %eax, %esi +; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %ebx +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovgel %edx, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl $-1, %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: cmovgel %eax, %ecx +; X86-NEXT: cmovgel %edx, %ebx +; X86-NEXT: shldl $31, %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill diff --git a/llvm/test/CodeGen/X86/select-1-or-neg1.ll b/llvm/test/CodeGen/X86/select-1-or-neg1.ll index 9a4cb55e52bd9..ef740a1705351 100644 --- a/llvm/test/CodeGen/X86/select-1-or-neg1.ll +++ b/llvm/test/CodeGen/X86/select-1-or-neg1.ll @@ -11,7 +11,7 @@ define i32 @PR28968(i32 %x) { ; BASE-NEXT: xorl %eax, %eax ; BASE-NEXT: cmpl $1, %edi ; BASE-NEXT: sete %al -; BASE-NEXT: leal -1(%rax,%rax), %eax +; BASE-NEXT: leal -1(,%rax,2), %eax ; BASE-NEXT: retq ; ; SLOWLEA3-LABEL: PR28968: diff --git a/llvm/test/CodeGen/X86/select-constant-lea.ll b/llvm/test/CodeGen/X86/select-constant-lea.ll index ab55082d209e3..91b00a2a3d2a9 100644 --- a/llvm/test/CodeGen/X86/select-constant-lea.ll +++ 
b/llvm/test/CodeGen/X86/select-constant-lea.ll @@ -8,7 +8,8 @@ define i32 @select_unsigned_lt_10_8_13(i32 %0) { ; BASE-NEXT: xorl %eax, %eax ; BASE-NEXT: cmpl $10, %edi ; BASE-NEXT: setae %al -; BASE-NEXT: leal 8(%rax,%rax,4), %eax +; BASE-NEXT: leal (%rax,%rax,4), %eax +; BASE-NEXT: addl $8, %eax ; BASE-NEXT: retq ; ; SLOWLEA3-LABEL: select_unsigned_lt_10_8_13: diff --git a/llvm/test/CodeGen/X86/select-of-half-constants.ll b/llvm/test/CodeGen/X86/select-of-half-constants.ll index e3d92eb474968..a178ebbaf5bfe 100644 --- a/llvm/test/CodeGen/X86/select-of-half-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-half-constants.ll @@ -6,8 +6,8 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_olt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} ; X64-AVX512FP16-NEXT: retq @@ -19,8 +19,8 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { define half @fcmp_select_fp_constants_ogt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_ogt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vcmpgtsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} ; X64-AVX512FP16-NEXT: retq diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index 4e31b48ec5cec..28281ec2e2cd7 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -346,8 +346,8 @@ define void @test6(i32 %C, ptr %A, ptr %B) nounwind { ; MCU-NEXT: fmul %st, %st(0) ; MCU-NEXT: fxch %st(3) ; MCU-NEXT: fmul %st, %st(0) -; MCU-NEXT: testl %eax, %eax ; MCU-NEXT: flds (%edx) +; MCU-NEXT: testl %eax, %eax ; MCU-NEXT: je .LBB5_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: fstp %st(1) @@ -744,13 +744,21 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone { ;; Select between -1 and 1. 
define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone { -; CHECK-LABEL: test10: -; CHECK: ## %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq $1, %rdi -; CHECK-NEXT: setae %al -; CHECK-NEXT: leaq -1(%rax,%rax), %rax -; CHECK-NEXT: retq +; GENERIC-LABEL: test10: +; GENERIC: ## %bb.0: +; GENERIC-NEXT: xorl %eax, %eax +; GENERIC-NEXT: cmpq $1, %rdi +; GENERIC-NEXT: setae %al +; GENERIC-NEXT: leaq -1(,%rax,2), %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test10: +; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax +; ATOM-NEXT: cmpq $1, %rdi +; ATOM-NEXT: setae %al +; ATOM-NEXT: leaq -1(%rax,%rax), %rax +; ATOM-NEXT: retq ; ; ATHLON-LABEL: test10: ; ATHLON: ## %bb.0: @@ -983,13 +991,21 @@ define i8 @nezero_all_ones_or_const(i8 %x) { } define i32 @PR53006(i32 %x) { -; CHECK-LABEL: PR53006: -; CHECK: ## %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: negl %edi -; CHECK-NEXT: setae %al -; CHECK-NEXT: leal -1(%rax,%rax), %eax -; CHECK-NEXT: retq +; GENERIC-LABEL: PR53006: +; GENERIC: ## %bb.0: +; GENERIC-NEXT: xorl %eax, %eax +; GENERIC-NEXT: negl %edi +; GENERIC-NEXT: setae %al +; GENERIC-NEXT: leal -1(,%rax,2), %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: PR53006: +; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax +; ATOM-NEXT: negl %edi +; ATOM-NEXT: setae %al +; ATOM-NEXT: leal -1(%rax,%rax), %eax +; ATOM-NEXT: retq ; ; ATHLON-LABEL: PR53006: ; ATHLON: ## %bb.0: @@ -1005,7 +1021,7 @@ define i32 @PR53006(i32 %x) { ; MCU-NEXT: xorl %ecx, %ecx ; MCU-NEXT: negl %eax ; MCU-NEXT: setae %cl -; MCU-NEXT: leal -1(%ecx,%ecx), %eax +; MCU-NEXT: leal -1(,%ecx,2), %eax ; MCU-NEXT: retl %z = icmp eq i32 %x, 0 %r = select i1 %z, i32 1, i32 -1 @@ -1328,14 +1344,14 @@ define void @clamp_i8(i32 %src, ptr %dst) { ; ; MCU-LABEL: clamp_i8: ; MCU: # %bb.0: -; MCU-NEXT: cmpl $127, %eax ; MCU-NEXT: movl $127, %ecx +; MCU-NEXT: cmpl $127, %eax ; MCU-NEXT: jg .LBB26_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: movl %eax, %ecx ; MCU-NEXT: .LBB26_2: -; MCU-NEXT: cmpl $-128, %ecx ; MCU-NEXT: movb $-128, %al +; MCU-NEXT: cmpl $-128, %ecx ; MCU-NEXT: jl .LBB26_4 ; MCU-NEXT: # %bb.3: ; MCU-NEXT: movl %ecx, %eax @@ -1390,14 +1406,14 @@ define void @clamp(i32 %src, ptr %dst) { ; ; MCU-LABEL: clamp: ; MCU: # %bb.0: -; MCU-NEXT: cmpl $32768, %eax # imm = 0x8000 ; MCU-NEXT: movl $32767, %ecx # imm = 0x7FFF +; MCU-NEXT: cmpl $32768, %eax # imm = 0x8000 ; MCU-NEXT: jge .LBB27_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: movl %eax, %ecx ; MCU-NEXT: .LBB27_2: -; MCU-NEXT: cmpl $-32768, %ecx # imm = 0x8000 ; MCU-NEXT: movl $32768, %eax # imm = 0x8000 +; MCU-NEXT: cmpl $-32768, %ecx # imm = 0x8000 ; MCU-NEXT: jl .LBB27_4 ; MCU-NEXT: # %bb.3: ; MCU-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll index 35f4655dd6d7c..0c9411ef471de 100644 --- a/llvm/test/CodeGen/X86/select_const.ll +++ b/llvm/test/CodeGen/X86/select_const.ll @@ -367,7 +367,8 @@ define i64 @select_lea_3(i1 zeroext %cond) { ; X64: # %bb.0: ; X64-NEXT: xorb $1, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: leaq -2(%rax,%rax,2), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: addq $-2, %rax ; X64-NEXT: retq %sel = select i1 %cond, i64 -2, i64 1 ret i64 %sel @@ -379,14 +380,16 @@ define i32 @select_lea_5(i1 zeroext %cond) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorb $1, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -2(%eax,%eax,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %eax +; X86-NEXT: addl $-2, %eax ; X86-NEXT: retl ; ; X64-LABEL: select_lea_5: ; X64: # %bb.0: ; X64-NEXT: xorb 
$1, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: leal -2(%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: addl $-2, %eax ; X64-NEXT: retq %sel = select i1 %cond, i32 -2, i32 3 ret i32 %sel @@ -415,7 +418,8 @@ define i64 @select_lea_9(i1 zeroext %cond) { ; X64: # %bb.0: ; X64-NEXT: xorb $1, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: leaq -7(%rax,%rax,8), %rax +; X64-NEXT: leaq (%rax,%rax,8), %rax +; X64-NEXT: addq $-7, %rax ; X64-NEXT: retq %sel = select i1 %cond, i64 -7, i64 2 ret i64 %sel @@ -427,11 +431,11 @@ define i64 @sel_1_2(i64 %x, i64 %y) { ; X86-LABEL: sel_1_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $42, {{[0-9]+}}(%esp) ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: addl $2, %eax ; X86-NEXT: adcl $0, %edx @@ -505,16 +509,27 @@ define i32 @sel_1_neg1_32(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setge %al -; X86-NEXT: leal -1(%eax,%eax,8), %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: decl %eax ; X86-NEXT: retl ; -; X64-LABEL: sel_1_neg1_32: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $43, %edi -; X64-NEXT: setge %al -; X64-NEXT: leal -1(%rax,%rax,8), %eax -; X64-NEXT: retq +; X64-FASTINC-LABEL: sel_1_neg1_32: +; X64-FASTINC: # %bb.0: +; X64-FASTINC-NEXT: xorl %eax, %eax +; X64-FASTINC-NEXT: cmpl $43, %edi +; X64-FASTINC-NEXT: setge %al +; X64-FASTINC-NEXT: leal (%rax,%rax,8), %eax +; X64-FASTINC-NEXT: decl %eax +; X64-FASTINC-NEXT: retq +; +; X64-SLOWINC-LABEL: sel_1_neg1_32: +; X64-SLOWINC: # %bb.0: +; X64-SLOWINC-NEXT: xorl %eax, %eax +; X64-SLOWINC-NEXT: cmpl $43, %edi +; X64-SLOWINC-NEXT: setge %al +; X64-SLOWINC-NEXT: leal (%rax,%rax,8), %eax +; X64-SLOWINC-NEXT: addl $-1, %eax +; X64-SLOWINC-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sel = select i1 %cmp, i32 8, i32 -1 ret i32 %sel @@ -526,7 +541,8 @@ define i32 @sel_neg1_1_32(i32 %x) { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $43, {{[0-9]+}}(%esp) ; X86-NEXT: setl %al -; X86-NEXT: leal -7(%eax,%eax,8), %eax +; X86-NEXT: leal (%eax,%eax,8), %eax +; X86-NEXT: addl $-7, %eax ; X86-NEXT: retl ; ; X64-LABEL: sel_neg1_1_32: @@ -534,7 +550,8 @@ define i32 @sel_neg1_1_32(i32 %x) { ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $43, %edi ; X64-NEXT: setl %al -; X64-NEXT: leal -7(%rax,%rax,8), %eax +; X64-NEXT: leal (%rax,%rax,8), %eax +; X64-NEXT: addl $-7, %eax ; X64-NEXT: retq %cmp = icmp sgt i32 %x, 42 %sel = select i1 %cmp, i32 -7, i32 2 diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll index c98aae7fbf405..6c050bcd215c0 100644 --- a/llvm/test/CodeGen/X86/setcc-logic.ll +++ b/llvm/test/CodeGen/X86/setcc-logic.ll @@ -560,8 +560,8 @@ define i1 @or_icmps_const_1bit_diff_extra_use(i8 %x, ptr %p) { ; CHECK-NEXT: cmpb $45, %dil ; CHECK-NEXT: sete %cl ; CHECK-NEXT: cmpb $43, %dil -; CHECK-NEXT: sete %al ; CHECK-NEXT: sete (%rsi) +; CHECK-NEXT: sete %al ; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: retq %a = icmp eq i8 %x, 43 diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2ac2be5545dfd..c1bf5cc1884e4 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -53,14 +53,14 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # 
Child Loop BB0_2 Depth 2 ; CHECK-NEXT: xorpd %xmm3, %xmm3 -; CHECK-NEXT: movq $-1024, %rsi # imm = 0xFC00 +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: movdqa %xmm0, %xmm4 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %vector.body ; CHECK-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movdqu 1024(%rdx,%rsi), %xmm5 -; CHECK-NEXT: movdqu 1040(%rdx,%rsi), %xmm6 +; CHECK-NEXT: movdqu (%rdx,%rsi), %xmm5 +; CHECK-NEXT: movdqu 16(%rdx,%rsi), %xmm6 ; CHECK-NEXT: movq %xmm5, %rdi ; CHECK-NEXT: movq %xmm6, %r8 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] @@ -92,6 +92,7 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: orpd %xmm8, %xmm3 ; CHECK-NEXT: paddq %xmm2, %xmm4 ; CHECK-NEXT: addq $32, %rsi +; CHECK-NEXT: cmpq $1024, %rsi # imm = 0x400 ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.3: # %middle.block ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 @@ -105,36 +106,36 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax ; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx ; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx -; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1] -; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2] ; CHECK-AVX2-NEXT: .p2align 4 ; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph ; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1 ; CHECK-AVX2-NEXT: # Child Loop BB0_2 Depth 2 ; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: movq $-1024, %rsi # imm = 0xFC00 +; CHECK-AVX2-NEXT: xorl %esi, %esi ; CHECK-AVX2-NEXT: vmovdqa %xmm0, %xmm4 ; CHECK-AVX2-NEXT: .p2align 4 ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 -; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 +; CHECK-AVX2-NEXT: vmovdqu (%rdx,%rsi), %xmm5 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi +; CHECK-AVX2-NEXT: vmovdqu 16(%rdx,%rsi), %xmm6 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 -; CHECK-AVX2-NEXT: vmovq %xmm5, %r9 -; CHECK-AVX2-NEXT: vmovq %xmm6, %r10 -; CHECK-AVX2-NEXT: negq %r10 -; CHECK-AVX2-NEXT: movq %rcx, %r10 -; CHECK-AVX2-NEXT: sbbq %r8, %r10 -; CHECK-AVX2-NEXT: setge %r8b -; CHECK-AVX2-NEXT: movzbl %r8b, %r8d -; CHECK-AVX2-NEXT: negq %r8 -; CHECK-AVX2-NEXT: vmovq %r8, %xmm5 +; CHECK-AVX2-NEXT: vmovq %xmm6, %r9 +; CHECK-AVX2-NEXT: negq %r9 +; CHECK-AVX2-NEXT: movq %rcx, %r9 +; CHECK-AVX2-NEXT: sbbq %r8, %r9 +; CHECK-AVX2-NEXT: vmovq %xmm5, %r8 +; CHECK-AVX2-NEXT: setge %r9b +; CHECK-AVX2-NEXT: movzbl %r9b, %r9d ; CHECK-AVX2-NEXT: negq %r9 +; CHECK-AVX2-NEXT: negq %r8 ; CHECK-AVX2-NEXT: movq %rcx, %r8 ; CHECK-AVX2-NEXT: sbbq %rdi, %r8 +; CHECK-AVX2-NEXT: vmovq %r9, %xmm5 ; CHECK-AVX2-NEXT: setge %dil ; CHECK-AVX2-NEXT: movzbl %dil, %edi ; CHECK-AVX2-NEXT: negq %rdi @@ -145,6 +146,7 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3 ; CHECK-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm4 ; CHECK-AVX2-NEXT: addq $32, %rsi +; CHECK-AVX2-NEXT: cmpq $1024, %rsi # imm = 0x400 ; CHECK-AVX2-NEXT: jne .LBB0_2 ; CHECK-AVX2-NEXT: # %bb.3: # %middle.block ; CHECK-AVX2-NEXT: # in Loop: Header=BB0_1 Depth=1 diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 
5aa266db6553d..06b4c6fe5000d 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -280,38 +280,38 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; ; SSE41-LABEL: ne_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm2, %rdx -; SSE41-NEXT: movq %xmm1, %rsi -; SSE41-NEXT: movq %xmm3, %rdi -; SSE41-NEXT: pextrq $1, %xmm0, %r8 -; SSE41-NEXT: pextrq $1, %xmm2, %r9 -; SSE41-NEXT: pextrq $1, %xmm1, %r10 +; SSE41-NEXT: movq %xmm0, %rdx +; SSE41-NEXT: movq %xmm2, %rsi +; SSE41-NEXT: movq %xmm1, %rdi +; SSE41-NEXT: movq %xmm3, %r8 +; SSE41-NEXT: pextrq $1, %xmm0, %r9 +; SSE41-NEXT: pextrq $1, %xmm2, %r10 +; SSE41-NEXT: pextrq $1, %xmm1, %rcx ; SSE41-NEXT: pextrq $1, %xmm3, %rax ; SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rcx, %r11 -; SSE41-NEXT: movq %xmm6, %rcx -; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: movq %xmm5, %rdx +; SSE41-NEXT: xorq %rdx, %r11 +; SSE41-NEXT: movq %xmm6, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: movq %xmm7, %rsi +; SSE41-NEXT: orq %r11, %rdx +; SSE41-NEXT: movq %xmm5, %rsi ; SSE41-NEXT: xorq %rdi, %rsi -; SSE41-NEXT: orq %rdx, %rsi -; SSE41-NEXT: orq %rcx, %rsi -; SSE41-NEXT: pextrq $1, %xmm4, %rcx -; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm6, %rdx -; SSE41-NEXT: xorq %r9, %rdx -; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: pextrq $1, %xmm5, %rcx -; SSE41-NEXT: xorq %r10, %rcx -; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: orq %rcx, %rdi +; SSE41-NEXT: movq %xmm7, %rdi +; SSE41-NEXT: xorq %r8, %rdi +; SSE41-NEXT: orq %rsi, %rdi ; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: pextrq $1, %xmm4, %rdx +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: pextrq $1, %xmm6, %rsi +; SSE41-NEXT: xorq %r10, %rsi +; SSE41-NEXT: orq %rdx, %rsi +; SSE41-NEXT: pextrq $1, %xmm5, %rdx +; SSE41-NEXT: pextrq $1, %xmm7, %r8 +; SSE41-NEXT: xorq %rcx, %rdx +; SSE41-NEXT: xorq %rax, %r8 +; SSE41-NEXT: orq %rdx, %r8 +; SSE41-NEXT: orq %rsi, %r8 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: orq %rdi, %r8 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -346,13 +346,13 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX1-NEXT: xorq %r10, %rsi ; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %r8 ; AVX1-NEXT: xorq %rcx, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %rdx, %rcx -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: xorq %rax, %r8 +; AVX1-NEXT: orq %rdx, %r8 +; AVX1-NEXT: orq %rsi, %r8 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %rdi, %r8 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -388,13 +388,13 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: xorq %r10, %rsi ; AVX2-NEXT: orq %rdx, %rsi ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %r8 ; AVX2-NEXT: xorq %rcx, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: xorq %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: orq %rsi, %r8 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rcx +; AVX2-NEXT: orq %rdi, %r8 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -462,38 +462,38 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; ; SSE41-LABEL: eq_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm2, %rdx -; 
SSE41-NEXT: movq %xmm1, %rsi -; SSE41-NEXT: movq %xmm3, %rdi -; SSE41-NEXT: pextrq $1, %xmm0, %r8 -; SSE41-NEXT: pextrq $1, %xmm2, %r9 -; SSE41-NEXT: pextrq $1, %xmm1, %r10 +; SSE41-NEXT: movq %xmm0, %rdx +; SSE41-NEXT: movq %xmm2, %rsi +; SSE41-NEXT: movq %xmm1, %rdi +; SSE41-NEXT: movq %xmm3, %r8 +; SSE41-NEXT: pextrq $1, %xmm0, %r9 +; SSE41-NEXT: pextrq $1, %xmm2, %r10 +; SSE41-NEXT: pextrq $1, %xmm1, %rcx ; SSE41-NEXT: pextrq $1, %xmm3, %rax ; SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rcx, %r11 -; SSE41-NEXT: movq %xmm6, %rcx -; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: movq %xmm5, %rdx +; SSE41-NEXT: xorq %rdx, %r11 +; SSE41-NEXT: movq %xmm6, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: movq %xmm7, %rsi +; SSE41-NEXT: orq %r11, %rdx +; SSE41-NEXT: movq %xmm5, %rsi ; SSE41-NEXT: xorq %rdi, %rsi -; SSE41-NEXT: orq %rdx, %rsi -; SSE41-NEXT: orq %rcx, %rsi -; SSE41-NEXT: pextrq $1, %xmm4, %rcx -; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm6, %rdx -; SSE41-NEXT: xorq %r9, %rdx -; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: pextrq $1, %xmm5, %rcx -; SSE41-NEXT: xorq %r10, %rcx -; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %rax, %rdi -; SSE41-NEXT: orq %rcx, %rdi +; SSE41-NEXT: movq %xmm7, %rdi +; SSE41-NEXT: xorq %r8, %rdi +; SSE41-NEXT: orq %rsi, %rdi ; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: pextrq $1, %xmm4, %rdx +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: pextrq $1, %xmm6, %rsi +; SSE41-NEXT: xorq %r10, %rsi +; SSE41-NEXT: orq %rdx, %rsi +; SSE41-NEXT: pextrq $1, %xmm5, %rdx +; SSE41-NEXT: pextrq $1, %xmm7, %r8 +; SSE41-NEXT: xorq %rcx, %rdx +; SSE41-NEXT: xorq %rax, %r8 +; SSE41-NEXT: orq %rdx, %r8 +; SSE41-NEXT: orq %rsi, %r8 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: orq %rdi, %r8 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -528,13 +528,13 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX1-NEXT: xorq %r10, %rsi ; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %r8 ; AVX1-NEXT: xorq %rcx, %rdx -; AVX1-NEXT: vpextrq $1, %xmm1, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %rdx, %rcx -; AVX1-NEXT: orq %rsi, %rcx +; AVX1-NEXT: xorq %rax, %r8 +; AVX1-NEXT: orq %rdx, %r8 +; AVX1-NEXT: orq %rsi, %r8 ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdi, %rcx +; AVX1-NEXT: orq %rdi, %r8 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -570,13 +570,13 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: xorq %r10, %rsi ; AVX2-NEXT: orq %rdx, %rsi ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %r8 ; AVX2-NEXT: xorq %rcx, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: xorq %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: orq %rsi, %r8 ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdi, %rcx +; AVX2-NEXT: orq %rdi, %r8 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -600,11 +600,11 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; SSE2-LABEL: ne_v4i256: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movq %r10, %xmm0 -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %r10, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: orq 
{{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movq %rcx, %xmm0 @@ -641,17 +641,17 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; SSE41-NEXT: movq %rax, %xmm1 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movq %rcx, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq %rcx, %xmm0 ; SSE41-NEXT: movq %rdx, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE41-NEXT: por %xmm1, %xmm2 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: movq %r9, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; SSE41-NEXT: movq %r9, %xmm0 ; SSE41-NEXT: movq %r8, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE41-NEXT: movq %rsi, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdi ; SSE41-NEXT: movq %rdi, %xmm3 @@ -665,14 +665,14 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; AVX1-LABEL: ne_v4i256: ; AVX1: # %bb.0: ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rcx ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: orq %rax, %rcx ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: orq %r10, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: orq %r10, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r9 @@ -693,14 +693,14 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; AVX2-LABEL: ne_v4i256: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq %rax, %rcx ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: orq %r10, %rcx ; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: orq %r10, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 @@ -721,8 +721,8 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; AVX512-LABEL: ne_v4i256: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: shrq $32, %rax ; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 @@ -738,24 +738,24 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; AVX512-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 ; AVX512-NEXT: shrq $32, %r9 ; AVX512-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: vmovd %edx, %xmm1 +; AVX512-NEXT: vmovd %edx, %xmm2 ; AVX512-NEXT: shrq $32, %rdx -; AVX512-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 ; AVX512-NEXT: shrq $32, %rcx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm2 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 +; AVX512-NEXT: vmovd %edi, %xmm3 ; AVX512-NEXT: shrq $32, 
%rdi -; AVX512-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vpinsrd $1, %edi, %xmm3, %xmm3 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpinsrd $2, %esi, %xmm3, %xmm1 ; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 @@ -1052,54 +1052,54 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) { define i32 @ne_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: ne_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: movq 40(%rdi), %rax -; NO512-NEXT: movq 56(%rdi), %rcx -; NO512-NEXT: movq 24(%rdi), %rdx -; NO512-NEXT: xorq 24(%rsi), %rdx -; NO512-NEXT: xorq 56(%rsi), %rcx -; NO512-NEXT: movq 88(%rdi), %r8 -; NO512-NEXT: xorq 88(%rsi), %r8 +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: movq 48(%rdi), %rcx +; NO512-NEXT: movq 40(%rdi), %rdx +; NO512-NEXT: movq 24(%rdi), %r9 +; NO512-NEXT: xorq 24(%rsi), %r9 +; NO512-NEXT: movq 56(%rdi), %r8 +; NO512-NEXT: xorq 56(%rsi), %r8 +; NO512-NEXT: movq 88(%rdi), %r10 +; NO512-NEXT: xorq 88(%rsi), %r10 +; NO512-NEXT: orq %r9, %r10 +; NO512-NEXT: movq 120(%rdi), %r9 +; NO512-NEXT: xorq 120(%rsi), %r9 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 8(%rdi), %r8 +; NO512-NEXT: xorq 8(%rsi), %r8 +; NO512-NEXT: xorq 40(%rsi), %rdx +; NO512-NEXT: orq %r10, %r9 +; NO512-NEXT: movq 72(%rdi), %r10 +; NO512-NEXT: xorq 72(%rsi), %r10 +; NO512-NEXT: orq %r8, %r10 +; NO512-NEXT: movq 104(%rdi), %r8 +; NO512-NEXT: xorq 104(%rsi), %r8 ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 120(%rdi), %rdx -; NO512-NEXT: xorq 120(%rsi), %rdx +; NO512-NEXT: movq 16(%rdi), %rdx +; NO512-NEXT: orq %r10, %r8 +; NO512-NEXT: movq (%rdi), %r10 +; NO512-NEXT: xorq 16(%rsi), %rdx +; NO512-NEXT: xorq 48(%rsi), %rcx +; NO512-NEXT: xorq (%rsi), %r10 +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 80(%rdi), %r9 +; NO512-NEXT: xorq 80(%rsi), %r9 +; NO512-NEXT: orq %rdx, %r9 +; NO512-NEXT: movq 112(%rdi), %rdx +; NO512-NEXT: xorq 112(%rsi), %rdx ; NO512-NEXT: orq %rcx, %rdx -; NO512-NEXT: movq 8(%rdi), %rcx -; NO512-NEXT: xorq 8(%rsi), %rcx -; NO512-NEXT: xorq 40(%rsi), %rax -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: movq 72(%rdi), %r8 -; NO512-NEXT: xorq 72(%rsi), %r8 -; NO512-NEXT: orq %rcx, %r8 -; NO512-NEXT: movq 104(%rdi), %rcx -; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %r9, %rdx +; NO512-NEXT: movq 96(%rdi), %rcx +; NO512-NEXT: movq 64(%rdi), %rdi +; NO512-NEXT: xorq 64(%rsi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rcx +; NO512-NEXT: orq %r10, %rdi ; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 48(%rdi), %rax -; NO512-NEXT: orq %r8, %rcx -; NO512-NEXT: movq 16(%rdi), %r8 -; NO512-NEXT: xorq 16(%rsi), %r8 -; NO512-NEXT: xorq 48(%rsi), %rax +; NO512-NEXT: orq %rdi, %rcx ; NO512-NEXT: orq %rdx, %rcx -; NO512-NEXT: movq 80(%rdi), %rdx -; NO512-NEXT: xorq 80(%rsi), %rdx -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: movq 112(%rdi), %r8 -; NO512-NEXT: xorq 112(%rsi), %r8 -; NO512-NEXT: orq %rax, %r8 -; NO512-NEXT: movq (%rdi), %rax -; NO512-NEXT: xorq (%rsi), %rax -; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 64(%rdi), %rdx -; NO512-NEXT: xorq 64(%rsi), %rdx -; NO512-NEXT: orq %rax, %rdx -; NO512-NEXT: movq 32(%rdi), %rax -; NO512-NEXT: xorq 32(%rsi), 
%rax -; NO512-NEXT: movq 96(%rdi), %rdi -; NO512-NEXT: xorq 96(%rsi), %rdi -; NO512-NEXT: orq %rax, %rdi -; NO512-NEXT: orq %rdx, %rdi -; NO512-NEXT: orq %r8, %rdi ; NO512-NEXT: xorl %eax, %eax -; NO512-NEXT: orq %rcx, %rdi +; NO512-NEXT: orq %r8, %rcx ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; @@ -1146,54 +1146,54 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) { define i32 @eq_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: eq_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: movq 40(%rdi), %rax -; NO512-NEXT: movq 56(%rdi), %rcx -; NO512-NEXT: movq 24(%rdi), %rdx -; NO512-NEXT: xorq 24(%rsi), %rdx -; NO512-NEXT: xorq 56(%rsi), %rcx -; NO512-NEXT: movq 88(%rdi), %r8 -; NO512-NEXT: xorq 88(%rsi), %r8 +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: movq 48(%rdi), %rcx +; NO512-NEXT: movq 40(%rdi), %rdx +; NO512-NEXT: movq 24(%rdi), %r9 +; NO512-NEXT: xorq 24(%rsi), %r9 +; NO512-NEXT: movq 56(%rdi), %r8 +; NO512-NEXT: xorq 56(%rsi), %r8 +; NO512-NEXT: movq 88(%rdi), %r10 +; NO512-NEXT: xorq 88(%rsi), %r10 +; NO512-NEXT: orq %r9, %r10 +; NO512-NEXT: movq 120(%rdi), %r9 +; NO512-NEXT: xorq 120(%rsi), %r9 +; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 8(%rdi), %r8 +; NO512-NEXT: xorq 8(%rsi), %r8 +; NO512-NEXT: xorq 40(%rsi), %rdx +; NO512-NEXT: orq %r10, %r9 +; NO512-NEXT: movq 72(%rdi), %r10 +; NO512-NEXT: xorq 72(%rsi), %r10 +; NO512-NEXT: orq %r8, %r10 +; NO512-NEXT: movq 104(%rdi), %r8 +; NO512-NEXT: xorq 104(%rsi), %r8 ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 120(%rdi), %rdx -; NO512-NEXT: xorq 120(%rsi), %rdx +; NO512-NEXT: movq 16(%rdi), %rdx +; NO512-NEXT: orq %r10, %r8 +; NO512-NEXT: movq (%rdi), %r10 +; NO512-NEXT: xorq 16(%rsi), %rdx +; NO512-NEXT: xorq 48(%rsi), %rcx +; NO512-NEXT: xorq (%rsi), %r10 +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: orq %r9, %r8 +; NO512-NEXT: movq 80(%rdi), %r9 +; NO512-NEXT: xorq 80(%rsi), %r9 +; NO512-NEXT: orq %rdx, %r9 +; NO512-NEXT: movq 112(%rdi), %rdx +; NO512-NEXT: xorq 112(%rsi), %rdx ; NO512-NEXT: orq %rcx, %rdx -; NO512-NEXT: movq 8(%rdi), %rcx -; NO512-NEXT: xorq 8(%rsi), %rcx -; NO512-NEXT: xorq 40(%rsi), %rax -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: movq 72(%rdi), %r8 -; NO512-NEXT: xorq 72(%rsi), %r8 -; NO512-NEXT: orq %rcx, %r8 -; NO512-NEXT: movq 104(%rdi), %rcx -; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %r9, %rdx +; NO512-NEXT: movq 96(%rdi), %rcx +; NO512-NEXT: movq 64(%rdi), %rdi +; NO512-NEXT: xorq 64(%rsi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rcx +; NO512-NEXT: orq %r10, %rdi ; NO512-NEXT: orq %rax, %rcx -; NO512-NEXT: movq 48(%rdi), %rax -; NO512-NEXT: orq %r8, %rcx -; NO512-NEXT: movq 16(%rdi), %r8 -; NO512-NEXT: xorq 16(%rsi), %r8 -; NO512-NEXT: xorq 48(%rsi), %rax +; NO512-NEXT: orq %rdi, %rcx ; NO512-NEXT: orq %rdx, %rcx -; NO512-NEXT: movq 80(%rdi), %rdx -; NO512-NEXT: xorq 80(%rsi), %rdx -; NO512-NEXT: orq %r8, %rdx -; NO512-NEXT: movq 112(%rdi), %r8 -; NO512-NEXT: xorq 112(%rsi), %r8 -; NO512-NEXT: orq %rax, %r8 -; NO512-NEXT: movq (%rdi), %rax -; NO512-NEXT: xorq (%rsi), %rax -; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 64(%rdi), %rdx -; NO512-NEXT: xorq 64(%rsi), %rdx -; NO512-NEXT: orq %rax, %rdx -; NO512-NEXT: movq 32(%rdi), %rax -; NO512-NEXT: xorq 32(%rsi), %rax -; NO512-NEXT: movq 96(%rdi), %rdi -; NO512-NEXT: xorq 96(%rsi), %rdi -; NO512-NEXT: orq %rax, %rdi -; NO512-NEXT: orq %rdx, %rdi -; NO512-NEXT: orq %r8, %rdi ; NO512-NEXT: xorl %eax, %eax -; NO512-NEXT: orq %rcx, %rdi +; NO512-NEXT: orq %r8, %rcx ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; @@ -1267,18 +1267,18 @@ define i1 
@eq_i512_args(i512 %a, i512 %b) { ; ANY-LABEL: eq_i512_args: ; ANY: # %bb.0: ; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax ; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: orq %r10, %rcx +; ANY-NEXT: orq %rax, %rcx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi ; ANY-NEXT: orq %r9, %rsi ; ANY-NEXT: orq %rcx, %rsi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: orq %rax, %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; ANY-NEXT: orq %r10, %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi ; ANY-NEXT: orq %r8, %rdi ; ANY-NEXT: orq %rdx, %rdi @@ -1326,65 +1326,35 @@ define i1 @eq_i256_op(i256 %a, i256 %b) { } define i1 @eq_i512_op(i512 %a, i512 %b) { -; SSE-LABEL: eq_i512_op: -; SSE: # %bb.0: -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: addq $1, %rdi -; SSE-NEXT: adcq $0, %rsi -; SSE-NEXT: adcq $0, %rdx -; SSE-NEXT: adcq $0, %rcx -; SSE-NEXT: adcq $0, %r8 -; SSE-NEXT: adcq $0, %r9 -; SSE-NEXT: adcq $0, %r10 -; SSE-NEXT: adcq $0, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; SSE-NEXT: orq %rsi, %r9 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: orq %rdx, %r10 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: orq %r8, %rdi -; SSE-NEXT: orq %r10, %rdi -; SSE-NEXT: orq %rax, %rdi -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVXANY-LABEL: eq_i512_op: -; AVXANY: # %bb.0: -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: addq $1, %rdi -; AVXANY-NEXT: adcq $0, %rsi -; AVXANY-NEXT: adcq $0, %rdx -; AVXANY-NEXT: adcq $0, %rcx -; AVXANY-NEXT: adcq $0, %r8 -; AVXANY-NEXT: adcq $0, %r9 -; AVXANY-NEXT: adcq $0, %r10 -; AVXANY-NEXT: adcq $0, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; AVXANY-NEXT: orq %rsi, %r9 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: orq %rcx, %rax -; AVXANY-NEXT: orq %r9, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: orq %rdx, %r10 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; AVXANY-NEXT: orq %r8, %rdi -; AVXANY-NEXT: orq %r10, %rdi -; AVXANY-NEXT: orq %rax, %rdi -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; ANY-LABEL: eq_i512_op: +; ANY: # %bb.0: +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: addq $1, %rdi +; ANY-NEXT: adcq $0, %rsi +; ANY-NEXT: adcq $0, %rdx +; ANY-NEXT: adcq $0, %rcx +; ANY-NEXT: adcq $0, %r8 +; ANY-NEXT: adcq $0, %r9 +; ANY-NEXT: adcq $0, %r10 +; ANY-NEXT: adcq $0, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; ANY-NEXT: orq %rsi, %r9 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: orq %rcx, %rax +; ANY-NEXT: orq %r9, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; ANY-NEXT: orq %rdx, %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; 
ANY-NEXT: orq %r8, %rdi +; ANY-NEXT: orq %r10, %rdi +; ANY-NEXT: orq %rax, %rdi +; ANY-NEXT: sete %al +; ANY-NEXT: retq %a2 = add i512 %a, 1 %r = icmp eq i512 %a2, %b ret i1 %r @@ -1425,8 +1395,8 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { ; ANY: # %bb.0: ; ANY-NEXT: movq 40(%rdi), %rax ; ANY-NEXT: movq 48(%rdi), %r10 -; ANY-NEXT: movq 56(%rdi), %r11 ; ANY-NEXT: xorq 24(%rdi), %r8 +; ANY-NEXT: movq 56(%rdi), %r11 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r11 ; ANY-NEXT: orq %r8, %r11 ; ANY-NEXT: xorq 8(%rdi), %rdx @@ -1435,8 +1405,8 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { ; ANY-NEXT: orq %r11, %rax ; ANY-NEXT: xorq 32(%rdi), %r9 ; ANY-NEXT: xorq (%rdi), %rsi -; ANY-NEXT: orq %r9, %rsi ; ANY-NEXT: xorq 16(%rdi), %rcx +; ANY-NEXT: orq %r9, %rsi ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: orq %rcx, %r10 ; ANY-NEXT: orq %rsi, %r10 diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll index 60ac6df3f77af..c9af7f6f11dbc 100644 --- a/llvm/test/CodeGen/X86/setcc.ll +++ b/llvm/test/CodeGen/X86/setcc.ll @@ -50,8 +50,8 @@ define zeroext i16 @t2(i16 zeroext %x) nounwind readnone ssp { define i64 @t3(i64 %x) nounwind readnone ssp { ; X86-LABEL: t3: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $18, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: setb %al ; X86-NEXT: movzbl %al, %eax diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index 0f473bfbe4e47..380535c0c5838 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -228,7 +228,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind { ; AVX-LABEL: cmp_ult_load_const: ; AVX: # %bb.0: ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = [42,214,0,255] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,214,0,255] ; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %loadx = load <4 x i8>, ptr %x @@ -292,7 +292,7 @@ define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind { ; AVX-LABEL: cmp_slt_load_const: ; AVX: # %bb.0: ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [42,4294967254,0,4294967295] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,4294967254,0,4294967295] ; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %loadx = load <4 x i8>, ptr %x diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll index 9f7ac748c47e1..9409b531d56d7 100644 --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -83,9 +83,9 @@ define void @store32_shl_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind define void @modify32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind { ; X86-LABEL: modify32_shl_by_negated: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb $32, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, (%eax) ; X86-NEXT: retl ; @@ -171,26 +171,24 @@ define i64 @load64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind { define void @store64_shl_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind { ; X86-LABEL: store64_shl_by_negated: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
X86-NEXT: movb $64, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edx -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB6_2: -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %eax, 4(%esi) +; X86-NEXT: movl %edx, (%esi) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: store64_shl_by_negated: @@ -321,9 +319,9 @@ define void @store32_lshr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind define void @modify32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind { ; X86-LABEL: modify32_lshr_by_negated: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb $32, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl %cl, (%eax) ; X86-NEXT: retl ; @@ -409,26 +407,24 @@ define i64 @load64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind { define void @store64_lshr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind { ; X86-LABEL: store64_lshr_by_negated: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb $64, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: shrdl %cl, %edi, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: shrdl %cl, %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB14_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edx -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB14_2: -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: store64_lshr_by_negated: @@ -559,9 +555,9 @@ define void @store32_ashr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind define void @modify32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind { ; X86-LABEL: modify32_ashr_by_negated: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb $32, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl %cl, (%eax) ; X86-NEXT: retl ; @@ -652,22 +648,22 @@ define void @store64_ashr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb $64, %cl ; X86-NEXT: subb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sarl %cl, %esi -; X86-NEXT: shrdl %cl, %edi, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: shrdl %cl, %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB22_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: sarl $31, %edi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %edi, 
%esi ; X86-NEXT: .LBB22_2: -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 4(%edi) +; X86-NEXT: movl %eax, (%edi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl @@ -825,9 +821,9 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: addb $-64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -886,9 +882,9 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: addb $64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -1199,9 +1195,9 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: addb $-64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -1258,9 +1254,9 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: addb $64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -1421,9 +1417,9 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b ; X86-LABEL: reg32_lshr_by_masked_b_sub_negated_unfolded: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andl $31, %edx ; X86-NEXT: subl %edx, %ecx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index f2627df3a98d8..d57d3238b080b 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -169,10 +169,10 @@ define i64 @t6(i64 %key, ptr nocapture %val) nounwind { ; X86-LABEL: t6: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shrdl $3, %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shrl $3, %esi ; X86-NEXT: movl (%edx), %eax ; X86-NEXT: movl 4(%edx), %edx diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index 76cb4e87bae18..d6a8f1b3baea6 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -674,26 +674,26 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i ; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: .cfi_offset %esi, -12 ; X86-NEXT: .cfi_offset %edi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shll $16, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shll $16, %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shll $16, %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shll $16, %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %esi, 8(%eax) @@ -723,23 +723,27 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; X86-LABEL: or_tree_with_mismatching_shifts_vec_i32: ; X86: # %bb.0: -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi ; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %edi, -8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: .cfi_offset %esi, -16 +; X86-NEXT: .cfi_offset %edi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $16, %eax +; X86-NEXT: shll $16, %ebx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shll $16, %eax ; X86-NEXT: shll $17, %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax @@ -747,15 +751,14 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 ; X86-NEXT: orl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $16, %eax +; X86-NEXT: shll $16, %ebx +; X86-NEXT: shll $17, %edi ; X86-NEXT: shll $17, %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $16, %eax -; X86-NEXT: shll $17, %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: orl %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ecx, 12(%eax) @@ -763,8 +766,10 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %ebx ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; @@ -806,9 +811,9 @@ define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edx killed $edx def $rdx ; X64-NEXT: shlq $32, %rdx 
+; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movups %xmm0, (%rcx) ; X64-NEXT: movq %rdx, 16(%rcx) -; X64-NEXT: movq $0, 8(%rcx) -; X64-NEXT: movq $0, (%rcx) ; X64-NEXT: retq %zext1 = zext i128 %a1 to i192 %zext2 = zext i32 %a2 to i192 diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 767bd772ab7a3..3e2392d9e9eaf 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -32,22 +32,22 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shrb $3, %al ; i686-NEXT: andb $12, %al -; i686-NEXT: movzbl %al, %edi -; i686-NEXT: movl 8(%esp,%edi), %eax -; i686-NEXT: movl 4(%esp,%edi), %ebx -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movzbl %al, %esi +; i686-NEXT: movl 8(%esp,%esi), %eax +; i686-NEXT: movl 4(%esp,%esi), %edi +; i686-NEXT: movl %edi, %edx ; i686-NEXT: shrdl %cl, %eax, %edx -; i686-NEXT: movl (%esp,%edi), %esi -; i686-NEXT: movl 12(%esp,%edi), %edi -; i686-NEXT: shrdl %cl, %edi, %eax -; i686-NEXT: shrdl %cl, %ebx, %esi -; i686-NEXT: movl 40(%ebp), %ebx +; i686-NEXT: movl 12(%esp,%esi), %ebx +; i686-NEXT: shrdl %cl, %ebx, %eax +; i686-NEXT: movl (%esp,%esi), %esi +; i686-NEXT: shrdl %cl, %edi, %esi +; i686-NEXT: movl 40(%ebp), %edi ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, %edi -; i686-NEXT: movl %edi, 12(%ebx) -; i686-NEXT: movl %eax, 8(%ebx) -; i686-NEXT: movl %edx, 4(%ebx) -; i686-NEXT: movl %esi, (%ebx) +; i686-NEXT: shrl %cl, %ebx +; i686-NEXT: movl %ebx, 12(%edi) +; i686-NEXT: movl %eax, 8(%edi) +; i686-NEXT: movl %edx, 4(%edi) +; i686-NEXT: movl %esi, (%edi) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -100,22 +100,22 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: movl %ecx, %eax ; i686-NEXT: shrb $3, %al ; i686-NEXT: andb $12, %al -; i686-NEXT: movzbl %al, %edi -; i686-NEXT: movl 8(%esp,%edi), %eax -; i686-NEXT: movl 4(%esp,%edi), %ebx -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movzbl %al, %esi +; i686-NEXT: movl 8(%esp,%esi), %eax +; i686-NEXT: movl 4(%esp,%esi), %edi +; i686-NEXT: movl %edi, %edx ; i686-NEXT: shrdl %cl, %eax, %edx -; i686-NEXT: movl (%esp,%edi), %esi -; i686-NEXT: movl 12(%esp,%edi), %edi -; i686-NEXT: shrdl %cl, %edi, %eax -; i686-NEXT: shrdl %cl, %ebx, %esi -; i686-NEXT: movl 40(%ebp), %ebx +; i686-NEXT: movl 12(%esp,%esi), %ebx +; i686-NEXT: shrdl %cl, %ebx, %eax +; i686-NEXT: movl (%esp,%esi), %esi +; i686-NEXT: shrdl %cl, %edi, %esi +; i686-NEXT: movl 40(%ebp), %edi ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: sarl %cl, %edi -; i686-NEXT: movl %edi, 12(%ebx) -; i686-NEXT: movl %eax, 8(%ebx) -; i686-NEXT: movl %edx, 4(%ebx) -; i686-NEXT: movl %esi, (%ebx) +; i686-NEXT: sarl %cl, %ebx +; i686-NEXT: movl %ebx, 12(%edi) +; i686-NEXT: movl %eax, 8(%edi) +; i686-NEXT: movl %edx, 4(%edi) +; i686-NEXT: movl %esi, (%edi) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -172,20 +172,20 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: movsbl %al, %edi ; i686-NEXT: movl 20(%esp,%edi), %eax ; i686-NEXT: movl 24(%esp,%edi), %ebx -; i686-NEXT: movl %ebx, %esi -; i686-NEXT: shldl %cl, %eax, %esi -; i686-NEXT: movl 16(%esp,%edi), %edx -; i686-NEXT: movl 28(%esp,%edi), %edi -; i686-NEXT: shldl %cl, %ebx, %edi -; i686-NEXT: movl 40(%ebp), %ebx -; i686-NEXT: movl %edi, 12(%ebx) -; i686-NEXT: movl %esi, 8(%ebx) -; i686-NEXT: 
movl %edx, %esi -; i686-NEXT: shll %cl, %esi +; i686-NEXT: movl %ebx, %edx +; i686-NEXT: shldl %cl, %eax, %edx +; i686-NEXT: movl 28(%esp,%edi), %esi +; i686-NEXT: shldl %cl, %ebx, %esi +; i686-NEXT: movl 16(%esp,%edi), %edi +; i686-NEXT: movl %edi, %ebx +; i686-NEXT: shll %cl, %ebx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shldl %cl, %edx, %eax -; i686-NEXT: movl %eax, 4(%ebx) -; i686-NEXT: movl %esi, (%ebx) +; i686-NEXT: shldl %cl, %edi, %eax +; i686-NEXT: movl 40(%ebp), %ecx +; i686-NEXT: movl %esi, 12(%ecx) +; i686-NEXT: movl %edx, 8(%ecx) +; i686-NEXT: movl %eax, 4(%ecx) +; i686-NEXT: movl %ebx, (%ecx) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -276,58 +276,58 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, %ebx -; i686-NEXT: andl $31, %ebx +; i686-NEXT: movl %edx, %eax +; i686-NEXT: andl $31, %eax ; i686-NEXT: shrl $3, %edx ; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 40(%esp,%edx), %eax -; i686-NEXT: movl 36(%esp,%edx), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %eax, %esi +; i686-NEXT: movl 40(%esp,%edx), %esi +; i686-NEXT: movl 36(%esp,%edx), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 44(%esp,%edx), %edi +; i686-NEXT: movl %edi, (%esp) # 4-byte Spill +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrdl %cl, %edi, %esi ; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 32(%esp,%edx), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 44(%esp,%edx), %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl %ebx, %esi -; i686-NEXT: shrdl %cl, %edx, %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl 56(%ebp), %edx -; i686-NEXT: movl %edx, %eax -; i686-NEXT: andl $31, %eax +; i686-NEXT: movl %edx, %ebx +; i686-NEXT: andl $31, %ebx ; i686-NEXT: shrl $3, %edx ; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 72(%esp,%edx), %ebx -; i686-NEXT: movl 68(%esp,%edx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 72(%esp,%edx), %edi +; i686-NEXT: movl 68(%esp,%edx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: shrdl %cl, %edi, %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 76(%esp,%edx), %esi +; i686-NEXT: shrdl %cl, %esi, %edi ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebx, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 64(%esp,%edx), %edi -; i686-NEXT: movl 76(%esp,%edx), %edx -; i686-NEXT: shrdl %cl, %edx, %ebx -; i686-NEXT: movl %esi, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
i686-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl 64(%esp,%edx), %edx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %edi -; i686-NEXT: shrl %cl, %edx +; i686-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: shrl %cl, %esi ; i686-NEXT: movl 72(%ebp), %eax -; i686-NEXT: movl %edx, 28(%eax) -; i686-NEXT: movl %ebx, 24(%eax) +; i686-NEXT: movl %esi, 28(%eax) +; i686-NEXT: movl %edi, 24(%eax) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 20(%eax) -; i686-NEXT: movl %edi, 16(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %edx, 16(%eax) +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 12(%eax) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 8(%eax) @@ -345,22 +345,22 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; x86_64-LABEL: test_lshr_v2i128: ; x86_64: # %bb.0: # %entry ; x86_64-NEXT: movq %rcx, %rax -; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; x86_64-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: shrdq %cl, %rax, %rdx -; x86_64-NEXT: movl %r8d, %ecx +; x86_64-NEXT: movq %r8, %rcx ; x86_64-NEXT: shrdq %cl, %rsi, %rdi ; x86_64-NEXT: shrq %cl, %rsi -; x86_64-NEXT: xorl %r11d, %r11d -; x86_64-NEXT: testb $64, %r8b +; x86_64-NEXT: xorl %r8d, %r8d +; x86_64-NEXT: testb $64, %cl ; x86_64-NEXT: cmovneq %rsi, %rdi -; x86_64-NEXT: cmovneq %r11, %rsi +; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; x86_64-NEXT: cmovneq %r8, %rsi ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: shrq %cl, %rax ; x86_64-NEXT: testb $64, %r9b ; x86_64-NEXT: cmovneq %rax, %rdx -; x86_64-NEXT: cmovneq %r11, %rax +; x86_64-NEXT: cmovneq %r8, %rax ; x86_64-NEXT: movq %rax, 24(%r10) ; x86_64-NEXT: movq %rdx, 16(%r10) ; x86_64-NEXT: movq %rsi, 8(%r10) @@ -381,11 +381,11 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp -; i686-NEXT: subl $112, %esp -; i686-NEXT: movl 40(%ebp), %edx +; i686-NEXT: subl $128, %esp +; i686-NEXT: movl 40(%ebp), %esi ; i686-NEXT: movl 24(%ebp), %eax ; i686-NEXT: movl 28(%ebp), %ecx -; i686-NEXT: movl 32(%ebp), %esi +; i686-NEXT: movl 32(%ebp), %edx ; i686-NEXT: movl 16(%ebp), %edi ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl 12(%ebp), %edi @@ -401,7 +401,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl 36(%ebp), %edi ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: sarl $31, %edi @@ -409,55 +409,55 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, %eax -; i686-NEXT: andl $31, 
%eax -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 40(%esp,%edx), %esi -; i686-NEXT: movl 36(%esp,%edx), %edi +; i686-NEXT: movl %esi, %ecx +; i686-NEXT: andl $31, %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl $3, %esi +; i686-NEXT: andl $12, %esi +; i686-NEXT: movl 56(%esp,%esi), %edx +; i686-NEXT: movl 52(%esp,%esi), %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: shrdl %cl, %edx, %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 32(%esp,%edx), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 44(%esp,%edx), %edx -; i686-NEXT: movl %edx, (%esp) # 4-byte Spill -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 60(%esp,%esi), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrdl %cl, %edi, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 56(%ebp), %edx ; i686-NEXT: movl %edx, %ebx ; i686-NEXT: andl $31, %ebx ; i686-NEXT: shrl $3, %edx ; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 72(%esp,%edx), %esi -; i686-NEXT: movl 68(%esp,%edx), %edi +; i686-NEXT: movl 88(%esp,%edx), %eax +; i686-NEXT: movl 84(%esp,%edx), %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: shrdl %cl, %eax, %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 64(%esp,%edx), %ecx +; i686-NEXT: movl 92(%esp,%edx), %edi +; i686-NEXT: shrdl %cl, %edi, %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 48(%esp,%esi), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 76(%esp,%edx), %edx -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl 80(%esp,%edx), %edx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: sarl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, %edi -; i686-NEXT: sarl %cl, %edx +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: sarl %cl, %edi ; i686-NEXT: movl 72(%ebp), %eax -; i686-NEXT: movl %edx, 28(%eax) -; i686-NEXT: movl %esi, 24(%eax) +; i686-NEXT: movl %edi, 28(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 24(%eax) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 20(%eax) -; i686-NEXT: movl %edi, 16(%eax) -; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: movl %edx, 16(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 
12(%eax) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: movl %ecx, 8(%eax) @@ -475,29 +475,29 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; x86_64-LABEL: test_ashr_v2i128: ; x86_64: # %bb.0: # %entry ; x86_64-NEXT: movq %rcx, %rax -; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; x86_64-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: shrdq %cl, %rax, %rdx -; x86_64-NEXT: movl %r8d, %ecx +; x86_64-NEXT: movq %r8, %rcx ; x86_64-NEXT: shrdq %cl, %rsi, %rdi -; x86_64-NEXT: movq %rsi, %r11 -; x86_64-NEXT: sarq %cl, %r11 +; x86_64-NEXT: movq %rsi, %r8 +; x86_64-NEXT: sarq %cl, %r8 ; x86_64-NEXT: sarq $63, %rsi -; x86_64-NEXT: testb $64, %r8b -; x86_64-NEXT: cmovneq %r11, %rdi -; x86_64-NEXT: cmoveq %r11, %rsi +; x86_64-NEXT: testb $64, %cl +; x86_64-NEXT: cmovneq %r8, %rdi +; x86_64-NEXT: cmoveq %r8, %rsi ; x86_64-NEXT: movq %rax, %r8 ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: sarq %cl, %r8 +; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; x86_64-NEXT: sarq $63, %rax ; x86_64-NEXT: testb $64, %r9b ; x86_64-NEXT: cmovneq %r8, %rdx ; x86_64-NEXT: cmoveq %r8, %rax -; x86_64-NEXT: movq %rax, 24(%r10) -; x86_64-NEXT: movq %rdx, 16(%r10) -; x86_64-NEXT: movq %rsi, 8(%r10) -; x86_64-NEXT: movq %rdi, (%r10) +; x86_64-NEXT: movq %rax, 24(%rcx) +; x86_64-NEXT: movq %rdx, 16(%rcx) +; x86_64-NEXT: movq %rsi, 8(%rcx) +; x86_64-NEXT: movq %rdi, (%rcx) ; x86_64-NEXT: retq entry: %0 = ashr <2 x i128> %x, %a @@ -515,9 +515,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp ; i686-NEXT: subl $128, %esp -; i686-NEXT: movl 40(%ebp), %edi +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: movl 24(%ebp), %eax -; i686-NEXT: movl 28(%ebp), %ecx +; i686-NEXT: movl 28(%ebp), %edi ; i686-NEXT: movl 32(%ebp), %edx ; i686-NEXT: movl 20(%ebp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -530,9 +530,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl 36(%ebp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, %ebx +; i686-NEXT: movl %ecx, %ebx ; i686-NEXT: shrl $3, %ebx ; i686-NEXT: andl $12, %ebx ; i686-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -541,50 +541,48 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl (%eax), %esi ; i686-NEXT: movl 4(%eax), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 8(%eax), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %edi, %ecx +; i686-NEXT: movl 8(%eax), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: andl $31, %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shldl %cl, %edx, %eax +; i686-NEXT: shldl %cl, %edx, %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl (%eax), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 56(%ebp), %eax ; i686-NEXT: movl %eax, %edx ; i686-NEXT: shrl $3, %edx ; i686-NEXT: andl $12, %edx -; i686-NEXT: leal 
{{[0-9]+}}(%esp), %ecx -; i686-NEXT: subl %edx, %ecx +; i686-NEXT: leal {{[0-9]+}}(%esp), %esi +; i686-NEXT: subl %edx, %esi ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl (%ecx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 4(%ecx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 8(%ecx), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: andl $31, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %eax +; i686-NEXT: movl 4(%esi), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%esi), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shldl %cl, %edi, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %esi, %eax +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %ebx ; i686-NEXT: movl 76(%esp,%ebx), %ebx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shldl %cl, %esi, %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; i686-NEXT: shldl %cl, %edi, %ebx +; i686-NEXT: movl (%esi), %edi ; i686-NEXT: movl %edi, %esi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; i686-NEXT: movl %eax, %ecx @@ -618,22 +616,22 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; x86_64-LABEL: test_shl_v2i128: ; x86_64: # %bb.0: # %entry ; x86_64-NEXT: movq %rcx, %rax -; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; x86_64-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: shldq %cl, %rdx, %rax -; x86_64-NEXT: movl %r8d, %ecx +; x86_64-NEXT: movq %r8, %rcx ; x86_64-NEXT: shldq %cl, %rdi, %rsi ; x86_64-NEXT: shlq %cl, %rdi -; x86_64-NEXT: xorl %r11d, %r11d -; x86_64-NEXT: testb $64, %r8b +; x86_64-NEXT: xorl %r8d, %r8d +; x86_64-NEXT: testb $64, %cl ; x86_64-NEXT: cmovneq %rdi, %rsi -; x86_64-NEXT: cmovneq %r11, %rdi +; x86_64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; x86_64-NEXT: cmovneq %r8, %rdi ; x86_64-NEXT: movl %r9d, %ecx ; x86_64-NEXT: shlq %cl, %rdx ; x86_64-NEXT: testb $64, %r9b ; x86_64-NEXT: cmovneq %rdx, %rax -; x86_64-NEXT: cmovneq %r11, %rdx +; x86_64-NEXT: cmovneq %r8, %rdx ; x86_64-NEXT: movq %rax, 24(%r10) ; x86_64-NEXT: movq %rdx, 16(%r10) ; x86_64-NEXT: movq %rsi, 8(%r10) diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index 128e2199fb56f..69122ab7104ca 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -42,43 +42,43 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrb $5, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movl 40(%esp,%eax,4), %edx -; 
CHECK-NEXT: movl 36(%esp,%eax,4), %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shrdl %cl, %edx, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 44(%esp,%eax,4), %esi -; CHECK-NEXT: shrdl %cl, %esi, %edx +; CHECK-NEXT: movzbl %al, %edi +; CHECK-NEXT: movl 40(%esp,%edi,4), %eax +; CHECK-NEXT: movl 36(%esp,%edi,4), %edx ; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 48(%esp,%eax,4), %ebx -; CHECK-NEXT: shrdl %cl, %ebx, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 52(%esp,%eax,4), %esi -; CHECK-NEXT: shrdl %cl, %esi, %ebx -; CHECK-NEXT: movl 56(%esp,%eax,4), %edx +; CHECK-NEXT: shrdl %cl, %eax, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 44(%esp,%edi,4), %edx +; CHECK-NEXT: shrdl %cl, %edx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 48(%esp,%edi,4), %eax +; CHECK-NEXT: shrdl %cl, %eax, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 52(%esp,%edi,4), %esi +; CHECK-NEXT: shrdl %cl, %esi, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 56(%esp,%edi,4), %edx ; CHECK-NEXT: shrdl %cl, %edx, %esi -; CHECK-NEXT: movl 32(%esp,%eax,4), %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 60(%esp,%eax,4), %eax +; CHECK-NEXT: movl 60(%esp,%edi,4), %eax ; CHECK-NEXT: shrdl %cl, %eax, %edx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; CHECK-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: movl 32(%esp,%edi,4), %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %ebx, %edi ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: sarl %cl, %eax ; CHECK-NEXT: movl 72(%ebp), %ecx ; CHECK-NEXT: movl %eax, 28(%ecx) ; CHECK-NEXT: movl %edx, 24(%ecx) ; CHECK-NEXT: movl %esi, 20(%ecx) -; CHECK-NEXT: movl %ebx, 16(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 16(%ecx) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: movl %eax, 12(%ecx) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: movl %eax, 8(%ecx) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: movl %eax, 4(%ecx) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: movl %eax, (%ecx) +; CHECK-NEXT: movl %edi, (%ecx) ; CHECK-NEXT: leal -12(%ebp), %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi @@ -128,7 +128,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; ; CHECK-X64-O2-LABEL: shift1: ; CHECK-X64-O2: # %bb.0: # %entry -; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) @@ -138,23 +137,24 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movl %r8d, %eax +; CHECK-X64-O2-NEXT: shrb $6, %al +; CHECK-X64-O2-NEXT: movzbl %al, %eax +; CHECK-X64-O2-NEXT: movq -56(%rsp,%rax,8), %rdx +; CHECK-X64-O2-NEXT: movq 
-64(%rsp,%rax,8), %rsi +; CHECK-X64-O2-NEXT: movq %rsi, %rdi ; CHECK-X64-O2-NEXT: movl %r8d, %ecx -; CHECK-X64-O2-NEXT: shrb $6, %cl -; CHECK-X64-O2-NEXT: movzbl %cl, %edx -; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %rsi -; CHECK-X64-O2-NEXT: movq -72(%rsp,%rdx,8), %rdi -; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %r9 -; CHECK-X64-O2-NEXT: movq %r9, %r10 -; CHECK-X64-O2-NEXT: movl %r8d, %ecx -; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10 -; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rdx -; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi -; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi -; CHECK-X64-O2-NEXT: sarq %cl, %rdx -; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax) -; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax) -; CHECK-X64-O2-NEXT: movq %r10, 8(%rax) -; CHECK-X64-O2-NEXT: movq %rdi, (%rax) +; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rdi +; CHECK-X64-O2-NEXT: movq -48(%rsp,%rax,8), %r9 +; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdx +; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-X64-O2-NEXT: movq -72(%rsp,%rax,8), %rax +; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %rax +; CHECK-X64-O2-NEXT: sarq %cl, %r9 +; CHECK-X64-O2-NEXT: movq %r9, 24(%r10) +; CHECK-X64-O2-NEXT: movq %rdx, 16(%r10) +; CHECK-X64-O2-NEXT: movq %rdi, 8(%r10) +; CHECK-X64-O2-NEXT: movq %rax, (%r10) ; CHECK-X64-O2-NEXT: retq entry: %0 = ashr i256 %x, %a @@ -226,14 +226,14 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl %edx, 16(%eax) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: movl %edx, 12(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, 8(%eax) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shll %cl, %edx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: shldl %cl, %edi, %esi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 8(%eax) ; CHECK-NEXT: movl %esi, 4(%eax) ; CHECK-NEXT: movl %edx, (%eax) ; CHECK-NEXT: leal -12(%ebp), %esp @@ -285,33 +285,33 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-X64-O2-LABEL: shift2: ; CHECK-X64-O2: # %bb.0: ; CHECK-X64-O2-NEXT: movq %rsi, %rcx -; CHECK-X64-O2-NEXT: movq %rdi, %rax ; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0 -; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movl %ecx, %edx -; CHECK-X64-O2-NEXT: shrb $3, %dl -; CHECK-X64-O2-NEXT: andb $24, %dl -; CHECK-X64-O2-NEXT: negb %dl -; CHECK-X64-O2-NEXT: movsbq %dl, %rdx -; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi -; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rdi -; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %r8 -; CHECK-X64-O2-NEXT: movq %r8, %r9 -; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9 -; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx -; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx +; CHECK-X64-O2-NEXT: movl %ecx, %eax +; CHECK-X64-O2-NEXT: shrb $3, %al +; CHECK-X64-O2-NEXT: andb $24, %al +; CHECK-X64-O2-NEXT: negb %al +; CHECK-X64-O2-NEXT: movsbq %al, %rax +; CHECK-X64-O2-NEXT: movq -32(%rsp,%rax), %rdx +; CHECK-X64-O2-NEXT: movq -24(%rsp,%rax), %rsi ; CHECK-X64-O2-NEXT: movq %rsi, %r8 -; CHECK-X64-O2-NEXT: shlq %cl, %r8 +; CHECK-X64-O2-NEXT: shldq %cl, %rdx, %r8 +; CHECK-X64-O2-NEXT: movq -16(%rsp,%rax), 
%r9 +; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %r9 +; CHECK-X64-O2-NEXT: movq -40(%rsp,%rax), %rax +; CHECK-X64-O2-NEXT: movq %rax, %rsi +; CHECK-X64-O2-NEXT: shlq %cl, %rsi ; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi -; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax) -; CHECK-X64-O2-NEXT: movq %r9, 16(%rax) -; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax) -; CHECK-X64-O2-NEXT: movq %r8, (%rax) +; CHECK-X64-O2-NEXT: shldq %cl, %rax, %rdx +; CHECK-X64-O2-NEXT: movq %rdi, %rax +; CHECK-X64-O2-NEXT: movq %r9, 24(%rdi) +; CHECK-X64-O2-NEXT: movq %r8, 16(%rdi) +; CHECK-X64-O2-NEXT: movq %rdx, 8(%rdi) +; CHECK-X64-O2-NEXT: movq %rsi, (%rdi) ; CHECK-X64-O2-NEXT: retq { %b = shl i256 1, %c ; %c must not be a constant diff --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll index 62d86909ffca2..12516c50ece65 100644 --- a/llvm/test/CodeGen/X86/shift-parts.ll +++ b/llvm/test/CodeGen/X86/shift-parts.ll @@ -13,15 +13,22 @@ define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind { ; CHECK-NEXT: movq g_144+16(%rip), %rcx ; CHECK-NEXT: movzbl %sil, %edx ; CHECK-NEXT: shll $6, %edx +; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_3: # %for.cond +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq %rax, %rsi ; CHECK-NEXT: testb $64, %dl +; CHECK-NEXT: jne .LBB0_3 +; CHECK-NEXT: # %bb.2: # %for.cond +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movq %rcx, %rsi -; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: je .LBB0_1 -; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_4: # %if.then ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index e53eed4587797..1402b153f0d29 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -198,20 +198,18 @@ define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X86-AVX1-LABEL: mul_8xi8: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: movl c, %eax ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%eax,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%eax,%ecx,4) ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi8: @@ -247,12 +245,12 @@ define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X64-AVX1-LABEL: mul_8xi8: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq 
c(%rip), %rax ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) @@ -321,11 +319,9 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X86-AVX1-LABEL: mul_16xi8: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX1-NEXT: movl c, %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero @@ -333,34 +329,32 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: movl c, %edx ; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) -; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%edx,%eax,4) ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi8: ; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: movl c, %eax ; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%eax,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%eax,%ecx,4) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; @@ -392,7 +386,6 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X64-AVX1-LABEL: mul_16xi8: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero @@ -400,6 +393,7 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 @@ -413,12 +407,12 @@ define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X64-AVX2-LABEL: mul_16xi8: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) ; X64-AVX2-NEXT: vmovdqu 
%ymm1, (%rax,%rdx,4) @@ -615,20 +609,18 @@ define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X86-AVX1-LABEL: mul_8xi16: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: movl c, %eax ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%eax,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, (%eax,%ecx,4) ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_8xi16: @@ -663,12 +655,12 @@ define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 ; ; X64-AVX1-LABEL: mul_8xi16: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) ; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) @@ -736,11 +728,9 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; ; X86-AVX1-LABEL: mul_16xi16: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX1-NEXT: movl c, %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero @@ -748,34 +738,32 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: movl c, %edx ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) -; X86-AVX1-NEXT: popl %esi +; 
X86-AVX1-NEXT: vmovdqu %xmm0, 48(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%edx,%eax,4) ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16: ; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: movl c, %eax ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%eax,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%eax,%ecx,4) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; @@ -806,7 +794,6 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; ; X64-AVX1-LABEL: mul_16xi16: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero @@ -814,6 +801,7 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 @@ -827,12 +815,12 @@ define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i6 ; ; X64-AVX2-LABEL: mul_16xi16: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) @@ -1233,46 +1221,42 @@ define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly % ; ; X86-AVX1-LABEL: 
mul_16xi16_sext: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX1-NEXT: movl c, %ecx -; X86-AVX1-NEXT: vpmovsxwd 24(%esi,%eax), %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%esi,%eax), %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%esi,%eax), %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%esi,%eax), %xmm3 -; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%eax), %xmm4 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%eax), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%eax), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%eax), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%eax), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 24(%ecx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%eax), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd 16(%ecx,%eax), %xmm4 +; X86-AVX1-NEXT: movl c, %edx ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%eax), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd 8(%ecx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%edx,%eax), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd (%ecx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) -; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%edx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%edx,%eax,4) ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: mul_16xi16_sext: ; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi ; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 ; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 ; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: movl c, %eax ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%eax,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%eax,%ecx,4) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; @@ -1303,7 +1287,6 @@ define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly % ; ; X64-AVX1-LABEL: mul_16xi16_sext: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 ; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 ; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 @@ -1311,6 +1294,7 @@ define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly % ; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 ; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: movq c(%rip), %rax ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 @@ -1324,12 +1308,12 @@ define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly % ; ; X64-AVX2-LABEL: mul_16xi16_sext: ; X64-AVX2: # %bb.0: # %entry -; 
X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 ; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 ; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: movq c(%rip), %rax ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) @@ -1983,76 +1967,76 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movzwl 16(%eax), %edx ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE-NEXT: movdqa (%eax), %xmm2 -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: pextrw $7, %xmm2, %eax -; X86-SSE-NEXT: pextrw $4, %xmm2, %esi -; X86-SSE-NEXT: pextrw $1, %xmm2, %edi -; X86-SSE-NEXT: pextrw $0, %xmm2, %ebx -; X86-SSE-NEXT: pextrw $3, %xmm2, %ebp -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa (%eax), %xmm1 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 +; X86-SSE-NEXT: pextrw $7, %xmm1, %eax +; X86-SSE-NEXT: pextrw $4, %xmm1, %esi +; X86-SSE-NEXT: pextrw $1, %xmm1, %ebx +; X86-SSE-NEXT: pextrw $0, %xmm1, %edi +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pextrw $3, %xmm1, %ebp +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 28(%ecx) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-SSE-NEXT: movd %xmm3, %eax +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X86-SSE-NEXT: movd %xmm0, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 24(%ecx) -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-SSE-NEXT: movd %edx, %xmm4 ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 16(%ecx) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movd %xmm0, %eax +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; X86-SSE-NEXT: movd %xmm2, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 20(%ecx) -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X86-SSE-NEXT: movl %edi, %eax +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 4(%ecx) +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: movl %ebx, %eax +; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl (%ecx) -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE-NEXT: movl %ebp, 
%eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 12(%ecx) ; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE-NEXT: movd %xmm2, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE-NEXT: movd %xmm1, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 8(%ecx) -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 32(%ecx) -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) -; X86-SSE-NEXT: movdqa %xmm1, (%eax) ; X86-SSE-NEXT: movdqa %xmm0, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) ; X86-SSE-NEXT: addl $4, %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %edi @@ -2067,10 +2051,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-AVX1-NEXT: pushl %edi ; X86-AVX1-NEXT: pushl %esi ; X86-AVX1-NEXT: subl $16, %esp -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vmovd %xmm2, %eax ; X86-AVX1-NEXT: xorl %edx, %edx @@ -2133,9 +2117,9 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-LABEL: PR34947: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax @@ -2172,12 +2156,12 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl 12(%esi) ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 -; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl 32(%esi) -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-AVX2-NEXT: movl %eax, (%eax) ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) @@ -2188,75 +2172,75 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl 16(%rdi), %ecx -; X64-SSE-NEXT: movdqa (%rdi), %xmm2 -; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: pextrw $7, %xmm2, %eax -; X64-SSE-NEXT: pextrw $4, %xmm2, %edi -; X64-SSE-NEXT: pextrw $1, %xmm2, %r8d -; X64-SSE-NEXT: pextrw $0, %xmm2, %r9d -; X64-SSE-NEXT: pextrw $3, %xmm2, %r10d -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa (%rdi), %xmm1 +; X64-SSE-NEXT: pxor %xmm0, %xmm0 +; X64-SSE-NEXT: pextrw $7, %xmm1, %eax +; X64-SSE-NEXT: pextrw $4, %xmm1, %edi +; X64-SSE-NEXT: pextrw $1, %xmm1, %r9d +; X64-SSE-NEXT: pextrw $0, %xmm1, %r8d +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pextrw $3, %xmm1, %r10d +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 28(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-SSE-NEXT: movd %xmm3, %eax +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; X64-SSE-NEXT: movd %xmm0, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 24(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-SSE-NEXT: movd %edx, %xmm4 ; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm0, %eax +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; X64-SSE-NEXT: movd %xmm2, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 20(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; 
X64-SSE-NEXT: divl 4(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: movl %r9d, %eax +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) ; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 12(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X64-SSE-NEXT: movd %xmm2, %eax +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE-NEXT: movd %xmm1, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 8(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) -; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-SSE-NEXT: movl %eax, (%rax) -; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: movdqa %xmm0, (%rax) ; X64-SSE-NEXT: movdqa %xmm3, (%rax) ; X64-SSE-NEXT: retq ; @@ -2307,10 +2291,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-AVX1-NEXT: vpinsrd $2, %ebx, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $3, %r11d, %xmm0, %xmm0 ; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8199,8199,8199,8199] -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %r10d, %xmm2 ; X64-AVX1-NEXT: vpinsrd $1, %r9d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpinsrd $2, %r8d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: imull $8199, %ecx, %eax # imm = 0x2007 @@ -2360,12 +2344,12 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl 12(%rsi) ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm1 -; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; 
X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl 32(%rsi) -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-AVX2-NEXT: movl %eax, (%rax) ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll index cc7bfb58c329b..304ed988911ca 100644 --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -24,7 +24,7 @@ define <32 x half> @build_vec(ptr %p, <32 x i1> %mask) { ; CHECK-NEXT: je .LBB1_1 ; CHECK-NEXT: # %bb.2: # %cond.load ; CHECK-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; CHECK-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 ; CHECK-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index f632654f89e04..07effbc627069 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -285,8 +285,8 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) @@ -295,8 +295,8 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) @@ -354,8 +354,8 @@ define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) @@ -364,8 +364,8 @@ 
define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind { ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) @@ -423,8 +423,8 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) @@ -433,8 +433,8 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind { ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) @@ -489,27 +489,16 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512F: # %bb.0: @@ -549,27 +538,16 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = 
[2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512F: # %bb.0: @@ -609,27 +587,16 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512F: # %bb.0: @@ -669,27 +636,16 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw 
{{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_4: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512F: # %bb.0: @@ -729,27 +685,16 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_5: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512F: # %bb.0: @@ -789,27 +734,16 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; 
AVX-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512F: # %bb.0: @@ -849,27 +783,16 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_7: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512F: # %bb.0: @@ -908,3 +831,5 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 82c460fc55938..1fcd56023b772 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -65,7 +65,7 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: -; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] ; AVX512BWVL-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index a84466bc1ca1a..888a7c023bb81 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -704,7 +704,7 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,4] ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vzeroupper @@ -785,7 +785,7 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,4] ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vzeroupper @@ -864,17 +864,17 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; SSE42-NEXT: pextrw $6, %xmm2, %edi ; SSE42-NEXT: pextrw $4, %xmm2, %r8d ; SSE42-NEXT: pextrw $2, %xmm2, %r9d -; SSE42-NEXT: movd %xmm2, %r10d -; SSE42-NEXT: pextrw $6, %xmm1, %r11d -; SSE42-NEXT: pextrw $4, %xmm1, %ebx +; SSE42-NEXT: pextrw $6, %xmm1, %r10d +; SSE42-NEXT: pextrw $4, %xmm1, %r11d +; SSE42-NEXT: movd %xmm2, %ebx ; SSE42-NEXT: pextrw $2, %xmm1, %ebp ; SSE42-NEXT: movd %xmm1, %r14d ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pinsrb $4, %r14d, %xmm0 ; SSE42-NEXT: pinsrb $5, %ebp, %xmm0 -; SSE42-NEXT: pinsrb $6, %ebx, %xmm0 -; SSE42-NEXT: pinsrb $7, %r11d, %xmm0 -; SSE42-NEXT: pinsrb $8, %r10d, %xmm0 +; SSE42-NEXT: pinsrb $6, %r11d, %xmm0 +; SSE42-NEXT: pinsrb $7, %r10d, %xmm0 +; SSE42-NEXT: pinsrb $8, %ebx, %xmm0 ; SSE42-NEXT: pinsrb $9, %r9d, %xmm0 ; SSE42-NEXT: pinsrb $10, %r8d, %xmm0 ; SSE42-NEXT: pinsrb $11, %edi, %xmm0 @@ -896,26 +896,26 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; AVX1-NEXT: vpextrw $6, %xmm2, %eax ; AVX1-NEXT: vpextrw $4, %xmm2, %ecx ; AVX1-NEXT: vpextrw $2, %xmm2, %edx -; AVX1-NEXT: vmovd %xmm2, %esi -; AVX1-NEXT: vpextrw $6, %xmm1, %edi -; AVX1-NEXT: vpextrw $4, %xmm1, %r8d +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: vpextrw $4, %xmm1, %edi +; AVX1-NEXT: vmovd %xmm2, %r8d ; AVX1-NEXT: vpextrw $2, %xmm1, %r9d -; AVX1-NEXT: vmovd %xmm1, %r10d -; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm1 -; AVX1-NEXT: vpextrw $6, %xmm1, %r11d -; AVX1-NEXT: vpextrw $4, %xmm1, %ebx -; AVX1-NEXT: vpextrw $2, %xmm1, %ebp -; AVX1-NEXT: vmovd %xmm1, %r14d +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm2, %r10d +; AVX1-NEXT: vpextrw $4, %xmm2, %r11d +; AVX1-NEXT: vmovd %xmm1, %ebx +; AVX1-NEXT: vpextrw $2, %xmm2, %ebp +; AVX1-NEXT: vmovd %xmm2, %r14d ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $6, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $8, %ebx, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -934,26 +934,26 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; AVX2-NEXT: vpextrw $6, %xmm2, %eax ; AVX2-NEXT: vpextrw $4, %xmm2, %ecx ; AVX2-NEXT: vpextrw $2, %xmm2, %edx -; AVX2-NEXT: vmovd %xmm2, %esi -; AVX2-NEXT: vpextrw $6, %xmm1, %edi -; AVX2-NEXT: vpextrw $4, %xmm1, %r8d +; AVX2-NEXT: vpextrw $6, %xmm1, %esi +; AVX2-NEXT: vpextrw $4, %xmm1, %edi +; AVX2-NEXT: vmovd %xmm2, %r8d ; AVX2-NEXT: vpextrw $2, %xmm1, %r9d -; AVX2-NEXT: vmovd %xmm1, %r10d -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrw $6, %xmm1, %r11d -; AVX2-NEXT: vpextrw $4, %xmm1, %ebx -; AVX2-NEXT: vpextrw $2, %xmm1, %ebp -; AVX2-NEXT: vmovd %xmm1, %r14d +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm2, %r10d +; AVX2-NEXT: vpextrw $4, %xmm2, %r11d +; AVX2-NEXT: vmovd %xmm1, %ebx +; AVX2-NEXT: vpextrw $2, %xmm2, %ebp +; AVX2-NEXT: vmovd %xmm2, %r14d ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $6, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $8, %ebx, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $11, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -1114,10 +1114,10 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-FAST: # %bb.0: ; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79] +; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm2, %eax ; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2 ; 
AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 -; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax ; AVX512VBMI-FAST-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 ; AVX512VBMI-FAST-NEXT: vzeroupper ; AVX512VBMI-FAST-NEXT: retq @@ -1144,52 +1144,52 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind define <16 x i8> @oddelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind { ; SSE2-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pextrw $7, %xmm3, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: pextrw $5, %xmm3, %eax -; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: pextrw $3, %xmm3, %eax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: pextrw $1, %xmm3, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pextrw $7, %xmm2, %eax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: pextrw $5, %xmm2, %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: pextrw $3, %xmm2, %eax -; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: pextrw $1, %xmm2, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: pextrw $5, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: pextrw $3, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm12 -; SSE2-NEXT: pextrw $1, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $7, %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %edx +; SSE2-NEXT: pextrw $1, %xmm2, %esi +; SSE2-NEXT: pextrw $5, %xmm2, %edi +; SSE2-NEXT: pextrw $1, %xmm3, %r8d +; SSE2-NEXT: pextrw $5, %xmm3, %r9d +; SSE2-NEXT: pextrw $7, %xmm3, %r10d +; SSE2-NEXT: movd %r10d, %xmm5 +; SSE2-NEXT: movd %r9d, %xmm4 +; SSE2-NEXT: pextrw $3, %xmm3, %r9d +; SSE2-NEXT: movd %r9d, %xmm6 +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: pextrw $7, %xmm2, %r8d +; SSE2-NEXT: movd %r8d, %xmm8 +; SSE2-NEXT: movd %edi, %xmm7 +; SSE2-NEXT: pextrw $3, %xmm2, %edi +; SSE2-NEXT: movd %edi, %xmm9 +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: pextrw $7, %xmm1, %esi +; SSE2-NEXT: movd %esi, %xmm10 +; SSE2-NEXT: movd %edx, %xmm11 +; SSE2-NEXT: pextrw $3, %xmm1, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm12 +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: movd %ecx, %xmm13 ; SSE2-NEXT: movd %eax, %xmm14 ; SSE2-NEXT: pextrw $3, %xmm0, %eax ; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: pextrw $1, %xmm0, %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3],xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 84ae818d91832..f961add3fd826 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -29,7 +29,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { ; ; AVX2-LABEL: shuffle_v32i8_to_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -38,7 +38,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -47,7 +47,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, 
%xmm0 @@ -324,8 +324,7 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi) ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -810,7 +809,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -881,7 +880,7 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1009,7 +1008,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1166,27 +1165,16 @@ define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: 
shuffle_v32i8_to_v4i8: ; AVX512F: # %bb.0: @@ -1231,27 +1219,16 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { } define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i64_to_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i8: ; AVX512F: # %bb.0: @@ -1305,7 +1282,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1313,7 +1290,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX2-LABEL: negative: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1323,7 +1300,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512F-LABEL: negative: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1344,7 +1321,7 
@@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512BW-LABEL: negative: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1364,8 +1341,7 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; ; AVX512VBMIVL-LABEL: negative: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 ; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VBMIVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index e27a77ed2293d..652928efac579 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -35,7 +35,7 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind { ; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-FAST-ALL-NEXT: vzeroupper @@ -45,7 +45,7 @@ define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind { ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -275,7 +275,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,16,21] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] @@ -286,7 +286,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,13] ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] @@ -297,7 +297,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,13] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,62,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -306,7 +306,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,5] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,12,0,5] ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u] @@ -343,14 +343,14 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) { ; AVX512VL-FAST-ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; ; AVX512VL-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -382,22 +382,22 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) { define <4 x double> @PR34175(ptr %p) { ; AVX512F-LABEL: PR34175: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512F-NEXT: vmovdqu 
32(%rdi), %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovdqu (%rdi), %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: PR34175: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm0 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovdqu (%rdi), %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/signed-truncation-check.ll b/llvm/test/CodeGen/X86/signed-truncation-check.ll index fc0fbb206cbb3..6918fe12f6921 100644 --- a/llvm/test/CodeGen/X86/signed-truncation-check.ll +++ b/llvm/test/CodeGen/X86/signed-truncation-check.ll @@ -110,8 +110,8 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind { ; X86-NEXT: movswl %ax, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -135,8 +135,8 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind { ; X86-NEXT: movsbl %al, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: sete %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 975ffd06db03b..39c642b7676a1 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -114,7 +114,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778] +; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SLOW-NEXT: pmaddwd %xmm2, %xmm0 ; SLOW-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW-NEXT: ret{{[l|q]}} @@ -124,7 +124,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: pmovsxwd {{.*#+}} 
xmm2 = [18778,18778,18778,18778] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-NEXT: pmaddwd %xmm2, %xmm0 ; SSE4-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-NEXT: ret{{[l|q]}} @@ -221,7 +221,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SLOW-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SLOW-NEXT: pmaddwd %xmm4, %xmm0 ; SLOW-NEXT: pmaddwd %xmm4, %xmm1 ; SLOW-NEXT: pmaddwd %xmm4, %xmm2 @@ -237,7 +237,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE4-NEXT: pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778] +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0] ; SSE4-NEXT: pmaddwd %xmm4, %xmm0 ; SSE4-NEXT: pmaddwd %xmm4, %xmm1 ; SSE4-NEXT: pmaddwd %xmm4, %xmm2 @@ -347,7 +347,7 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { ; ; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -399,7 +399,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE4-NEXT: pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm1, %xmm2 ; SSE4-NEXT: pmulld %xmm0, %xmm1 ; SSE4-NEXT: movdqa %xmm2, %xmm0 @@ -407,7 +407,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -480,7 +480,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE4-NEXT: pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778] +; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778] ; SSE4-NEXT: pmulld %xmm3, 
%xmm0 ; SSE4-NEXT: pmulld %xmm3, %xmm4 ; SSE4-NEXT: pmulld %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll index 86891e964d96d..18115499c08cc 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -154,28 +154,28 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ebx, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovll %ebx, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmovll %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -253,24 +253,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { ; ; X86-LABEL: test_v3i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: cmovgl %ebx, %eax -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: cmovgl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovgl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovgl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmpl %ecx, %esi ; X86-NEXT: cmovgl %esi, %ecx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl %r = call <3 x i32> @llvm.smax.v3i32(<3 x i32> %a, <3 x i32> %b) ret <3 x i32> %r diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index 8907f6c4cd598..ec7fddec4af11 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -154,29 +154,28 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll %ebx, %edx ; X86-NEXT: cmovll %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -254,24 +253,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { ; ; X86-LABEL: test_v3i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: cmovll %ebx, %eax -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: cmovll %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovll %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovll %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmpl %ecx, %esi ; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl %r = call <3 x i32> @llvm.smin.v3i32(<3 x i32> %a, <3 x i32> %b) ret <3 x i32> %r diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll index df167338268c4..6541adb7d93dc 100644 --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -205,7 +205,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl %edx, %esi @@ -217,7 +217,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: addl %eax, %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb %al @@ -232,75 +232,71 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: setb %al -; X86-NEXT: addl 
%ecx, %ebx -; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl %ebx, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %ebp, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %eax, %edi ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movzbl (%esp), %edi # 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: 
movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -412,53 +408,53 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %edi, 
%eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -701,37 +697,35 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; X64-NEXT: andl $1, %r13d -; X64-NEXT: negq %r13 -; X64-NEXT: andl $1, %r14d -; X64-NEXT: negq %r14 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: andl $1, %ecx +; X64-NEXT: negq %rcx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: negq %r13 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rdx, %r12 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: addq %rax, %r12 ; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: setb %cl -; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: setb %r10b +; X64-NEXT: movzbl %r10b, %r15d ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: adcq %rdx, %rcx -; X64-NEXT: addq %rdi, %r11 -; X64-NEXT: adcq %r12, %rcx +; X64-NEXT: adcq %rdx, %r15 +; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: adcq %r12, %r15 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rbp @@ -745,94 +739,95 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: adcq %r8, %rsi ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %ebp -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rsi, %r10 ; X64-NEXT: adcq %rbp, %r8 -; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: addq %rbx, %r10 ; X64-NEXT: adcq %r12, %r8 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: adcq %rsi, %r14 ; X64-NEXT: setb %sil ; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: addq %rax, %r15 +; X64-NEXT: addq %rax, %r14 ; X64-NEXT: adcq %rdx, %rsi -; X64-NEXT: addq %r9, %r15 +; X64-NEXT: addq %r9, %r14 ; X64-NEXT: adcq %rbx, %rsi ; X64-NEXT: addq %r9, %r10 ; X64-NEXT: adcq %r8, %rbx -; X64-NEXT: adcq $0, %r15 +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rcx, %rdi -; X64-NEXT: sarq $63, %rdi -; X64-NEXT: addq %r11, %r15 -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: movq %r15, %r9 +; X64-NEXT: sarq $63, %r9 
+; X64-NEXT: addq %r11, %r14 +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: movq %r9, %r15 +; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: setb %al -; X64-NEXT: addq %r8, %r11 -; X64-NEXT: movzbl %al, %r12d +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: addq %rax, %r11 ; X64-NEXT: adcq %rdx, %r12 +; X64-NEXT: setb %al +; X64-NEXT: addq %r8, %r12 +; X64-NEXT: movzbl %al, %ebp +; X64-NEXT: adcq %rdx, %rbp ; X64-NEXT: movq %r13, %rax -; X64-NEXT: imulq %r14 -; X64-NEXT: addq %rax, %rax +; X64-NEXT: imulq %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rax, %rcx ; X64-NEXT: adcq %rdx, %rdx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq %r8, %r15 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: adcq %r9, %rax -; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %r12, %rcx +; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: adcq %r9, %rdx ; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: sarq $63, %rsi -; X64-NEXT: xorq %rsi, %rax -; X64-NEXT: xorq %rsi, %r15 -; X64-NEXT: orq %rax, %r15 +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: xorq %rsi, %r14 +; X64-NEXT: orq %rcx, %r14 ; X64-NEXT: xorq %rsi, %rdx -; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: xorq %r11, %rsi ; X64-NEXT: orq %rdx, %rsi -; X64-NEXT: orq %r15, %rsi -; X64-NEXT: movl %r10d, %edx -; X64-NEXT: andl $1, %edx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: negq %rcx -; X64-NEXT: xorq %rcx, %rbx -; X64-NEXT: xorq %r10, %rcx -; X64-NEXT: orq %rbx, %rcx -; X64-NEXT: orq %rsi, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, 8(%rax) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, (%rax) -; X64-NEXT: movb %dl, 16(%rax) -; X64-NEXT: setne 32(%rax) +; X64-NEXT: orq %r14, %rsi +; X64-NEXT: movl %r10d, %ecx +; X64-NEXT: andl $1, %ecx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: negq %rdx +; X64-NEXT: xorq %rdx, %rbx +; X64-NEXT: xorq %r10, %rdx +; X64-NEXT: orq %rbx, %rdx +; X64-NEXT: orq %rsi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; X64-NEXT: movq %rdx, (%rdi) +; X64-NEXT: movb %cl, 16(%rdi) +; X64-NEXT: setne 32(%rdi) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..aa2812195d83f 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -52,11 +52,11 @@ define i64 @func2(i64 %x, i64 %y) { ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi @@ -160,38 +160,36 @@ 
define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X86-LABEL: vec: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shldl $30, %edi, %ecx ; X86-NEXT: shldl $30, %eax, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edx, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shldl $30, %eax, %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: shldl $30, %ebx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %edx -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %ebp, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edx, 12(%ebx) +; X86-NEXT: movl %edi, 8(%ebx) +; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2) ret <4 x i32> %tmp @@ -287,15 +285,15 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: imull {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -319,11 +317,11 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi ; X86-NEXT: addl %edx, %ebx @@ -368,42 +366,42 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; 
X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %esi -; X86-NEXT: cmovnsl %edi, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: cmovnsl %ebp, %edi +; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edi, %edx ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %edx -; X86-NEXT: cmovnsl %ecx, %edi -; X86-NEXT: shldl $1, %edi, %edx -; X86-NEXT: shrdl $31, %edi, %eax +; X86-NEXT: cmovnsl %edi, %edx +; X86-NEXT: cmovnsl %ecx, %esi +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: shrdl $31, %esi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index e68b6e328b723..4afe614055437 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -62,12 +62,12 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -200,45 +200,43 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movd %xmm2, %ecx ; X64-NEXT: movslq %ecx, %rdx ; X64-NEXT: imulq %rax, %rdx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: shrdl $2, %ecx, %edx -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmovgel %eax, %edx -; X64-NEXT: cmpl $-2, %ecx -; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 -; X64-NEXT: cmovll %ecx, %edx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %esi +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: shrdl $2, %eax, %edx +; X64-NEXT: cmpl $2, %eax +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: cmovgel %ecx, %edx +; X64-NEXT: cmpl $-2, %eax +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: cmovll %eax, %edx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: movd %xmm2, %esi +; X64-NEXT: movslq %esi, %rdi +; X64-NEXT: pshufd {{.*#+}} xmm2 
= xmm0[2,3,2,3] +; X64-NEXT: movd %xmm2, %esi ; X64-NEXT: movslq %esi, %rsi -; X64-NEXT: imulq %rdx, %rsi -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: shrdl $2, %edx, %esi -; X64-NEXT: cmpl $2, %edx -; X64-NEXT: cmovgel %eax, %esi -; X64-NEXT: cmpl $-2, %edx -; X64-NEXT: cmovll %ecx, %esi +; X64-NEXT: imulq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: shrq $32, %rdi +; X64-NEXT: shrdl $2, %edi, %esi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: cmovgel %ecx, %esi +; X64-NEXT: cmpl $-2, %edi +; X64-NEXT: cmovll %eax, %esi +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: movslq %edi, %r8 +; X64-NEXT: movd %xmm0, %edi +; X64-NEXT: movslq %edi, %rdi +; X64-NEXT: imulq %r8, %rdi +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: shrq $32, %r8 +; X64-NEXT: shrdl $2, %r8d, %edi +; X64-NEXT: cmpl $2, %r8d +; X64-NEXT: cmovgel %ecx, %edi +; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: cmpl $-2, %r8d +; X64-NEXT: cmovll %eax, %edi ; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: movd %xmm0, %esi -; X64-NEXT: movslq %esi, %rsi -; X64-NEXT: imulq %rdx, %rsi -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: shrdl $2, %edx, %esi -; X64-NEXT: cmpl $2, %edx -; X64-NEXT: cmovgel %eax, %esi -; X64-NEXT: cmpl $-2, %edx -; X64-NEXT: cmovll %ecx, %esi -; X64-NEXT: movd %esi, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X64-NEXT: movd %xmm1, %edx ; X64-NEXT: movslq %edx, %rdx @@ -250,13 +248,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: shrq $32, %rdx ; X64-NEXT: shrdl $2, %edx, %esi ; X64-NEXT: cmpl $2, %edx -; X64-NEXT: cmovgel %eax, %esi +; X64-NEXT: cmovgel %ecx, %esi +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movd %edi, %xmm0 ; X64-NEXT: cmpl $-2, %edx -; X64-NEXT: cmovll %ecx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: cmovll %eax, %esi +; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; X64-NEXT: retq ; ; X86-LABEL: vec: @@ -265,45 +264,43 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrdl $2, %edx, %ecx ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; X86-NEXT: cmovgel %ebp, %ecx +; X86-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: cmovgel %ebx, %ecx ; X86-NEXT: cmpl $-2, %edx -; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 -; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 +; X86-NEXT: cmovll %ebp, %ecx ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: shrdl $2, %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shrdl $2, %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: cmovgel %ebp, %edi +; X86-NEXT: cmovgel %ebx, %esi ; X86-NEXT: cmpl $-2, %edx -; X86-NEXT: cmovll %esi, %edi -; X86-NEXT: 
movl %ebx, %eax +; X86-NEXT: cmovll %ebp, %esi ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shrdl $2, %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shrdl $2, %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: cmovgel %ebp, %ebx +; X86-NEXT: cmovgel %ebx, %edi ; X86-NEXT: cmpl $-2, %edx -; X86-NEXT: cmovll %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovll %ebp, %edi ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: cmovgel %ebp, %eax +; X86-NEXT: cmovgel %ebx, %eax ; X86-NEXT: cmpl $-2, %edx -; X86-NEXT: cmovll %esi, %eax +; X86-NEXT: cmovll %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %esi, 4(%edx) ; X86-NEXT: movl %ecx, (%edx) ; X86-NEXT: movl %edx, %eax ; X86-NEXT: popl %esi @@ -369,63 +366,62 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 24 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: imull %edi, %esi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: imull %edi, %edx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %esi +; X86-NEXT: imull %eax, %edi ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %eax, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sarl $31, %edi ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: imull %ebp ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %ebp, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: xorl %esi, %edx -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: 
sarl $31, %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: xorl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: orl %edx, %esi -; X86-NEXT: notl %ecx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel %edi, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: orl %edx, %edi +; X86-NEXT: notl %esi +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl %ebx, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: addl $4, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 @@ -545,25 +541,25 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sets %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: cmovol %eax, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: cmovol %ebx, %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl %edi, %ebx ; X86-NEXT: sets %al ; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: imull %ebx, %edx +; X86-NEXT: imull %edi, %edx ; X86-NEXT: cmovol %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: xorl %esi, %ebx @@ -615,32 +611,32 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %ebp, %esi ; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi @@ -703,39 +699,39 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl 
%eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnsl %ebx, %ebp -; X86-NEXT: cmovnsl %edi, %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebp, %edi +; X86-NEXT: cmovnsl %esi, %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %ecx +; X86-NEXT: cmovnsl %edi, %ecx ; X86-NEXT: cmovnsl %edx, %esi ; X86-NEXT: shrdl $31, %esi, %eax ; X86-NEXT: shrdl $31, %ecx, %esi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 13596e1b18768..7b45364f7b3fa 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -16,43 +16,44 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rsi, %r9 +; X64-NEXT: movq %rdi, %r11 ; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: sarq $63, %r14 ; X64-NEXT: imulq %rdx, %r14 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r11, %rbx -; X64-NEXT: adcq %r14, %r10 -; X64-NEXT: movq %r10, %r14 -; X64-NEXT: sarq $63, %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r14, %rdi +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: movq %rcx, %r15 ; X64-NEXT: sarq $63, %r15 -; X64-NEXT: imulq %rdi, %r15 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: imulq %r11, %r15 +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbx, %rdi -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %r14 ; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %r14, %rbx +; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: movq %rbx, %r10 +; X64-NEXT: sarq $63, %r10 +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %r14, %r10 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %rcx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: movq %rdi, 8(%r8) -; X64-NEXT: sarq $63, %rdi -; X64-NEXT: xorq %rdi, %rdx -; X64-NEXT: xorq %rax, 
%rdi -; X64-NEXT: orq %rdx, %rdi +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: movq %r11, 8(%r8) +; X64-NEXT: sarq $63, %r11 +; X64-NEXT: xorq %r11, %rdx +; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: orq %rdx, %r11 ; X64-NEXT: setne %al ; X64-NEXT: movq %rsi, (%r8) ; X64-NEXT: popq %rbx @@ -76,15 +77,14 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %ebx, %edi @@ -109,58 +109,59 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: imull %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax 
-; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -212,9 +213,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: imull %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -224,6 +224,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: addl %eax, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -312,178 +313,177 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %r8, %r12 -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: addq %r11, %r13 +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: addq %r13, %r12 +; X64-NEXT: adcq %r14, %rbp ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %rcx, %r8 -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: imulq %rcx, %rsi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: addq %rax, %r15 -; X64-NEXT: addq %rsi, %r15 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: sarq $63, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: imulq %r8, %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: addq %rdi, %r14 
; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %r8, %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: addq %rax, %r11 +; X64-NEXT: adcq %rsi, %r14 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rsi, %r15 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %r15, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rdi, %rbx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rbx, %rsi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: adcq %rdi, %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq %r13, %rbp +; X64-NEXT: adcq %r12, %rbp +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r15, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r13, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %r12, %rdi ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdi, %r13 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rdi, %r12 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq %r11, %rdi -; X64-NEXT: movq %r11, %r8 +; X64-NEXT: movq %rcx, %rdi ; X64-NEXT: sarq $63, %rdi -; X64-NEXT: imulq %rdi, %r10 +; X64-NEXT: imulq %rdi, %rbx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: addq %rax, %r11 +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %rbx, %r13 ; X64-NEXT: addq %rax, %r13 -; X64-NEXT: adcq %r9, %r11 +; X64-NEXT: addq %rax, %r12 +; X64-NEXT: adcq %r9, %r13 ; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill 
-; X64-NEXT: adcq %rbp, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rbp, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r11, %rbp +; X64-NEXT: movq %r13, %rbp ; X64-NEXT: sarq $63, %rbp -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: movq %r12, %rax +; X64-NEXT: addq %r11, %r12 +; X64-NEXT: adcq %r14, %r13 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %rbp -; X64-NEXT: movq %r8, %rbx +; X64-NEXT: adcq %r15, %rbp +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rcx, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; X64-NEXT: imulq %rcx, %r8 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rax, %rsi -; X64-NEXT: addq %r8, %rsi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %r8, %r9 ; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload -; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload +; X64-NEXT: imulq %r15, %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: addq %r9, %r14 -; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: addq %rax, %r11 +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq %r9, %r11 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r15, %rdi +; X64-NEXT: addq %r14, %rdi ; X64-NEXT: adcq %r9, %r8 ; X64-NEXT: setb %cl -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: addq %r8, %rax ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r14, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq %r13, %rsi -; X64-NEXT: adcq %r11, %rdi +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq %r12, %rsi +; X64-NEXT: adcq %r13, %rdi ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; X64-NEXT: adcq %rbp, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload @@ -523,40 +523,41 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $128, %esp -; X86-NEXT: .cfi_def_cfa_offset 148 +; X86-NEXT: subl $124, %esp +; X86-NEXT: .cfi_def_cfa_offset 144 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; 
X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -570,91 +571,87 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -663,42 
+660,43 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl %edx, %eax ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -722,7 +720,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -755,7 +753,7 @@ define zeroext i1 @smuloi256(i256 
%v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -787,9 +785,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi @@ -805,199 +803,199 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: adcl %edi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) 
## 4-byte Spill -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload @@ -1005,21 +1003,21 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, %edx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: adcl %edx, %eax @@ -1032,29 +1030,27 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -1089,7 +1085,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1108,7 +1104,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1119,7 +1115,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload @@ -1176,10 +1172,10 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %edi, %ebp ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %edi, %ebp ; X86-NEXT: addl %ebp, %edx ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %ecx, %eax @@ -1205,14 +1201,13 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: imull %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload @@ -1236,14 +1231,14 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %edi @@ -1259,7 +1254,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: xorl (%esp), %edx ## 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %edi, %edx @@ -1280,7 +1275,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 24(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $128, %esp +; X86-NEXT: addl $124, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening.ll b/llvm/test/CodeGen/X86/speculative-load-hardening.ll index 5fd1f77e166d4..0b113e16e2259 100644 --- a/llvm/test/CodeGen/X86/speculative-load-hardening.ll +++ b/llvm/test/CodeGen/X86/speculative-load-hardening.ll @@ -1025,7 +1025,8 @@ define void @test_deferred_hardening(ptr %ptr1, ptr %ptr2, i32 %x) nounwind spec ; X64-NEXT: cmovneq %r15, %rax ; X64-NEXT: movl (%rbx), %ecx ; X64-NEXT: movl (%r14), %edx -; X64-NEXT: leal 1(%rcx,%rdx), %edi +; X64-NEXT: leal (%rcx,%rdx), %edi +; X64-NEXT: incl %edi ; X64-NEXT: orl %eax, %edi ; X64-NEXT: shlq $47, 
%rax ; X64-NEXT: orq %rax, %rsp @@ -1096,7 +1097,8 @@ define void @test_deferred_hardening(ptr %ptr1, ptr %ptr2, i32 %x) nounwind spec ; X64-LFENCE-NEXT: callq sink@PLT ; X64-LFENCE-NEXT: movl (%rbx), %eax ; X64-LFENCE-NEXT: movl (%r14), %ecx -; X64-LFENCE-NEXT: leal 1(%rax,%rcx), %edi +; X64-LFENCE-NEXT: leal (%rax,%rcx), %edi +; X64-LFENCE-NEXT: incl %edi ; X64-LFENCE-NEXT: callq sink@PLT ; X64-LFENCE-NEXT: movl (%rbx), %edi ; X64-LFENCE-NEXT: shll $7, %edi diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index 2c7da100344b7..0f2bf7fde74f5 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -24,27 +24,8 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] - ; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VPANDrr:%[0-9]+]]:vr128 = VPANDrr killed [[COPY2]], killed [[VPBROADCASTDrm]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDrr]] - ; CHECK-NEXT: [[VCMPSSrmi:%[0-9]+]]:fr32 = nofpexcept VCMPSSrmi killed [[COPY3]], $rip, 1, $noreg, %const.3, $noreg, 1, implicit $mxcsr :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr128 = COPY [[VCMPSSrmi]] - ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY4]], killed [[COPY1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] - ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call @@ -71,24 +52,8 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; 
CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] - ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS - ; CHECK-NEXT: [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[VCMPSSrri]] - ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY2]], killed [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] - ; CHECK-NEXT: $xmm0 = COPY [[COPY3]] + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index cc4bda81bef52..3102de10b2fd3 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -172,12 +172,12 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: shlq $31, %rcx ; SSE2-NEXT: sarq $31, %rcx ; SSE2-NEXT: shlq $31, %rdi -; SSE2-NEXT: sarq $31, %rdi ; SSE2-NEXT: shlq $31, %rsi ; SSE2-NEXT: sarq $31, %rsi ; SSE2-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72 ; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: imulq %r8 +; SSE2-NEXT: sarq $31, %rdi ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $63, %rax ; SSE2-NEXT: addq %rdx, %rax @@ -194,10 +194,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: movq %rdi, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: subq %rcx, %rdx ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $63, %rax @@ -206,8 +206,8 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 
-; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] @@ -248,7 +248,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: subq %rax, %rdi ; SSE41-NEXT: movq %rdi, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967295,1,4294967295,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE41-NEXT: movq %rcx, %rax @@ -269,9 +269,9 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE41-NEXT: movq %rax, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 ; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrb $8, %xmm0, %edx ; SSE41-NEXT: pextrb $0, %xmm3, %ecx +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -283,12 +283,12 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; AVX1-NEXT: shlq $31, %rcx ; AVX1-NEXT: sarq $31, %rcx ; AVX1-NEXT: shlq $31, %rdi -; AVX1-NEXT: sarq $31, %rdi ; AVX1-NEXT: shlq $31, %rsi ; AVX1-NEXT: sarq $31, %rsi ; AVX1-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72 ; AVX1-NEXT: movq %rsi, %rax ; AVX1-NEXT: imulq %r8 +; AVX1-NEXT: sarq $31, %rdi ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax ; AVX1-NEXT: addq %rdx, %rax @@ -320,14 +320,14 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: movl $3, %eax ; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vpextrb $4, %xmm0, %edx ; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: # kill: def $dl killed $dl killed $edx ; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx @@ -351,19 +351,19 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: leaq (%rax,%rax,8), %rax ; AVX2-NEXT: subq %rax, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: imulq %r8 +; AVX2-NEXT: vmovq %rsi, %xmm0 ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: leaq (%rax,%rax,8), %rax ; AVX2-NEXT: subq %rax, %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: subq %rcx, %rdx ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax @@ -372,17 +372,17 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8589934591,8589934591,8589934591,8589934591] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8589934591,8589934591,8589934591,8589934591] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: vpextrb $8, %xmm1, %edx ; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: # kill: def $dl killed $dl killed $edx ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll index 2b980683cba75..8dd232eed6c61 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll @@ -47,10 +47,10 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %eax # imm = 0x19999999 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 ; X86-NEXT: movl $42, %eax +; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: jb .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 08d9183bd30b6..844a995943aa6 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -129,9 +129,9 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [429496729,429496729,429496729,429496729] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -335,9 +335,9 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [306783378,306783378,306783378,306783378] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -449,17 +449,17 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -826,9 +826,9 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_odd_one: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [429496729,429496729,429496729,429496729] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -1050,9 +1050,9 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 @@ -1077,9 +1077,9 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; ; CHECK-AVX512VL-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: 
vpcmpeqd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 @@ -1132,62 +1132,62 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: por %xmm4, %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1214,32 +1214,32 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; CHECK-SSE2-NEXT: psrld $31, %xmm1 @@ -1248,62 +1248,62 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3067833783,1,3264175145] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: por %xmm4, %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud 
%xmm3, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm1 ; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1354,7 +1354,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1624,9 +1624,9 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [429496729,429496729,429496729,429496729] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -2234,12 +2234,11 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 ; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 ; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 ; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 ; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 ; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239] @@ -2252,9 +2251,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64] -; CHECK-SSE2-NEXT: psrlw $8, %xmm1 +; CHECK-SSE2-NEXT: por %xmm7, %xmm5 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32] +; CHECK-SSE2-NEXT: psrlw 
$8, %xmm1 ; CHECK-SSE2-NEXT: psrlw $8, %xmm0 ; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] @@ -2273,25 +2273,24 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; ; CHECK-SSE41-LABEL: pr51133: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE41-NEXT: pand %xmm5, %xmm4 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6 ; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] ; CHECK-SSE41-NEXT: psllw $8, %xmm6 -; CHECK-SSE41-NEXT: por %xmm0, %xmm6 +; CHECK-SSE41-NEXT: por %xmm4, %xmm6 ; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,1,128,1,128,32,1,1] -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [128,1,128,1,128,32,1,1] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32] +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 ; CHECK-SSE41-NEXT: psrlw $8, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm6 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] ; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 @@ -2312,9 +2311,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0 ; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,128,1,1,1,128,1,64] -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 +; CHECK-SSE41-NEXT: movq %rdi, %rax ; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,128,32,128,32] +; CHECK-SSE41-NEXT: psrlw $8, %xmm0 ; CHECK-SSE41-NEXT: psrlw $8, %xmm4 ; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] @@ -2376,18 +2376,18 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 ; CHECK-AVX1-NEXT: vpmaddubsw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpsllw $8, %xmm8, %xmm8 ; CHECK-AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 ; CHECK-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpsraw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [64,256,32,64,256,64,8,4] -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-AVX1-NEXT: vpsraw $8, %xmm8, %xmm8 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8 # [256,8,64,256,16,4,8,8] +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 ; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm8, %xmm7 ; CHECK-AVX1-NEXT: vpsrlw $7, %xmm6, %xmm6 @@ -2395,8 +2395,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 ; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 ; CHECK-AVX1-NEXT: vpsllw $8, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 @@ -2418,33 +2418,33 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [34048,34048,26368,37632,21760,33024,22016,35072,2304,0,10496,37632,33024,33024,21760,36096] -; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [20224,26368,6912,30976,33024,33024,33024,12032,22016,24320,37632,11008,12544,32512,16640,37632] +; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] -; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] +; CHECK-AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 ; CHECK-AVX2-NEXT: vpsllw $8, %ymm6, %ymm6 -; CHECK-AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 -; CHECK-AVX2-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX2-NEXT: vpsraw $8, %ymm4, %ymm4 -; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [8,8,128,64,8,256,256,8,64,256,32,64,256,64,8,4] -; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5 +; CHECK-AVX2-NEXT: vpaddb %ymm5, %ymm3, %ymm3 +; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX2-NEXT: vpsraw $8, %ymm5, %ymm5 +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [8,8,128,64,8,256,256,8,64,256,32,64,256,64,8,4] +; CHECK-AVX2-NEXT: vpsrlw $8, %ymm5, %ymm5 ; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX2-NEXT: vpsraw $8, %ymm6, %ymm6 ; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [64,128,128,16,256,64,256,16,256,8,64,256,16,4,8,8] ; CHECK-AVX2-NEXT: vpsrlw $8, %ymm6, %ymm6 -; CHECK-AVX2-NEXT: vpackuswb %ymm4, %ymm6, %ymm4 ; CHECK-AVX2-NEXT: vpsrlw $7, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] -; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX2-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 +; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm5, %ymm3 +; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] +; CHECK-AVX2-NEXT: vpand %ymm4, %ymm5, %ymm4 ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60,0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] ; CHECK-AVX2-NEXT: vpsllw $8, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index 33592027dee93..eeb1da3f5eda3 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -44,9 +44,9 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_odd_25: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [85899345,85899345,85899345,85899345] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] ; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ 
-185,9 +185,9 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-AVX2-LABEL: test_srem_odd_neg25: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [85899345,85899345,85899345,85899345] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] ; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -360,12 +360,12 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; ; CHECK-AVX2-LABEL: test_srem_odd_undef1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -471,12 +471,12 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; ; CHECK-AVX2-LABEL: test_srem_even_undef1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index e936e1ef81b74..82b55cffdb845 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -12,45 +12,45 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-NEXT: shrl $16, %ecx ; SSE-NEXT: subl %eax, %ecx ; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; 
SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: movd %xmm0, %edx ; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi +; SSE-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77 +; SSE-NEXT: shrl $16, %esi ; SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: movzwl %si, %esi +; SSE-NEXT: movswl %si, %edi +; SSE-NEXT: shrl $15, %esi +; SSE-NEXT: sarl $6, %edi +; SSE-NEXT: addl %esi, %edi +; SSE-NEXT: imull $95, %edi, %esi +; SSE-NEXT: subl %esi, %edx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pextrw $1, %xmm0, %edx +; SSE-NEXT: movswl %dx, %esi +; SSE-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF +; SSE-NEXT: movl %esi, %edi +; SSE-NEXT: shrl $31, %edi +; SSE-NEXT: sarl $21, %esi +; SSE-NEXT: addl %edi, %esi +; SSE-NEXT: movswl %cx, %edi +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $9, %edi +; SSE-NEXT: imull $-124, %esi, %esi +; SSE-NEXT: subl %esi, %edx +; SSE-NEXT: pinsrw $1, %edx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %edx +; SSE-NEXT: addl %ecx, %edi +; SSE-NEXT: movswl %dx, %ecx +; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; SSE-NEXT: movl %ecx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: sarl $18, %ecx +; SSE-NEXT: addl %esi, %ecx +; SSE-NEXT: imull $-1003, %edi, %esi # imm = 0xFC15 +; SSE-NEXT: imull $98, %ecx, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: subl %esi, %eax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -63,45 +63,45 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; AVX-NEXT: shrl $16, %ecx ; AVX-NEXT: subl %eax, %ecx ; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %xmm0, %edx ; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi +; AVX-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77 +; AVX-NEXT: shrl $16, %esi ; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 
0xA73 -; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: movzwl %si, %esi +; AVX-NEXT: movswl %si, %edi +; AVX-NEXT: shrl $15, %esi +; AVX-NEXT: sarl $6, %edi +; AVX-NEXT: addl %esi, %edi +; AVX-NEXT: imull $95, %edi, %esi +; AVX-NEXT: subl %esi, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpextrw $1, %xmm0, %edx +; AVX-NEXT: movswl %dx, %esi +; AVX-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF +; AVX-NEXT: movl %esi, %edi +; AVX-NEXT: shrl $31, %edi +; AVX-NEXT: sarl $21, %esi +; AVX-NEXT: addl %edi, %esi +; AVX-NEXT: movswl %cx, %edi +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $9, %edi +; AVX-NEXT: imull $-124, %esi, %esi +; AVX-NEXT: subl %esi, %edx +; AVX-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %edx +; AVX-NEXT: addl %ecx, %edi +; AVX-NEXT: movswl %dx, %ecx +; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; AVX-NEXT: movl %ecx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX-NEXT: sarl $18, %ecx +; AVX-NEXT: addl %esi, %ecx +; AVX-NEXT: imull $-1003, %edi, %esi # imm = 0xFC15 +; AVX-NEXT: imull $98, %ecx, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0 +; AVX-NEXT: subl %esi, %eax ; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, @@ -148,7 +148,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $6, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -270,18 +270,18 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-NEXT: leal (%rax,%rax,2), %edx ; SSE-NEXT: shll $3, %edx ; SSE-NEXT: subl %edx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %edx ; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B -; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: movswl %dx, %ecx +; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; SSE-NEXT: movl %ecx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E -; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: sarl $23, %ecx +; SSE-NEXT: addl %esi, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %edx ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %edx, %xmm1 ; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx @@ -298,32 +298,32 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; ; AVX-LABEL: dont_fold_srem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %eax +; AVX-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 +; AVX-NEXT: shrl $16, 
%eax +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: movzwl %ax, %edx +; AVX-NEXT: movswl %dx, %eax +; AVX-NEXT: shrl $15, %edx +; AVX-NEXT: sarl $4, %eax +; AVX-NEXT: addl %edx, %eax +; AVX-NEXT: leal (%rax,%rax,2), %edx +; AVX-NEXT: shll $3, %edx +; AVX-NEXT: subl %edx, %eax +; AVX-NEXT: vpextrw $1, %xmm0, %edx +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: movswl %dx, %ecx ; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B ; AVX-NEXT: movl %ecx, %esi ; AVX-NEXT: shrl $31, %esi ; AVX-NEXT: sarl $23, %ecx ; AVX-NEXT: addl %esi, %ecx ; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: subl %ecx, %edx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax ; AVX-NEXT: movswl %ax, %ecx ; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 @@ -449,11 +449,11 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F ; SSE-NEXT: subq %rax, %rcx ; SSE-NEXT: movq %rcx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: pextrq $1, %xmm0, %rcx ; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 ; SSE-NEXT: movq %rcx, %rax ; SSE-NEXT: imulq %rdx +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movq %rdx, %rax ; SSE-NEXT: shrq $63, %rax ; SSE-NEXT: sarq $8, %rdx @@ -492,20 +492,20 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F ; AVX1-NEXT: subq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax ; AVX1-NEXT: sarq $8, %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E ; AVX1-NEXT: subq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: dont_fold_srem_i64: @@ -536,20 +536,20 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F ; AVX2-NEXT: subq %rax, %rcx ; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx ; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax ; AVX2-NEXT: sarq $8, %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E ; AVX2-NEXT: subq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %1 = srem <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/X86/sse-fcopysign.ll b/llvm/test/CodeGen/X86/sse-fcopysign.ll index 3eadcad145b65..18b819128cd10 100644 --- a/llvm/test/CodeGen/X86/sse-fcopysign.ll +++ b/llvm/test/CodeGen/X86/sse-fcopysign.ll @@ -153,9 +153,9 @@ define x86_fp80 @int3(x86_fp80 %a, x86_fp80 %b) nounwind { ; X86-NEXT: fldt {{[0-9]+}}(%esp) ; X86-NEXT: fstpt (%esp) ; X86-NEXT: fabs +; X86-NEXT: testb $-128, {{[0-9]+}}(%esp) ; X86-NEXT: fld %st(0) ; X86-NEXT: fchs -; X86-NEXT: testb $-128, {{[0-9]+}}(%esp) ; X86-NEXT: fxch %st(1) ; X86-NEXT: fcmovne %st(1), %st ; X86-NEXT: fstp %st(1) @@ -168,9 +168,9 @@ define x86_fp80 @int3(x86_fp80 %a, x86_fp80 %b) nounwind { ; X64-NEXT: fldt {{[0-9]+}}(%rsp) ; X64-NEXT: fstpt -{{[0-9]+}}(%rsp) ; X64-NEXT: fabs +; X64-NEXT: testb $-128, -{{[0-9]+}}(%rsp) ; X64-NEXT: fld %st(0) ; X64-NEXT: fchs -; X64-NEXT: testb $-128, -{{[0-9]+}}(%rsp) ; X64-NEXT: fxch %st(1) ; X64-NEXT: fcmovne %st(1), %st ; X64-NEXT: fstp %st(1) diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index 03b9e123eea48..3bc9ae5a83adf 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -311,7 +311,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: movl %eax, %r9d ; LINUXOSX-NEXT: subl %ecx, %r9d ; LINUXOSX-NEXT: imull %r9d, %r8d -; LINUXOSX-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX-NEXT: leal (%r14,%r13), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d ; LINUXOSX-NEXT: imull %r11d, %r12d diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index 6f964f0a88ea3..7059155b98459 100644 --- a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -310,7 +310,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: movl %eax, %r9d ; LINUXOSX-NEXT: subl %ecx, %r9d ; LINUXOSX-NEXT: imull %r9d, %r8d -; LINUXOSX-NEXT: leal (%r13,%r14), %r9d +; LINUXOSX-NEXT: leal (%r14,%r13), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d ; LINUXOSX-NEXT: imull %r11d, %r12d diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index 8ac86d11d89e6..8f6fb4adac3c3 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -83,8 +83,8 @@ define <4 x float> @vselect(ptr%p, <4 x i32> %q) { ; ; X64-LABEL: vselect: ; X64: # %bb.0: # %entry -; X64-NEXT: testl %edx, %edx ; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB1_1 ; X64-NEXT: # %bb.2: # %entry ; X64-NEXT: xorps %xmm1, %xmm1 @@ -190,19 +190,19 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X64-LABEL: PR30512: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: xorl %edi, %edi +; X64-NEXT: xorl %r10d, %r10d ; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %r8d -; X64-NEXT: sete %dil -; X64-NEXT: negl %edi -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: sete %r10b +; X64-NEXT: negl %r10d +; X64-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %edi, %edi ; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %ecx ; X64-NEXT: sete %dil ; X64-NEXT: negl %edi -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %edx +; X64-NEXT: movl %edi, 
-{{[0-9]+}}(%rsp) ; X64-NEXT: sete %cl ; X64-NEXT: negl %ecx ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index dbdc45abb24d6..05971ceda4ea6 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3536,13 +3536,13 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X64-AVX1-LABEL: test_mm_set_epi8: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] +; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48] ; X64-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] -; X64-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X64-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x38] ; X64-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] -; X64-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] +; X64-AVX1-NEXT: vpinsrb $3, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x03] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X64-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] @@ -3566,13 +3566,13 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X64-AVX512-LABEL: test_mm_set_epi8: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50] +; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x48] ; X64-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x48] -; X64-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] +; X64-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb6,0x54,0x24,0x38] ; X64-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] -; X64-AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] +; X64-AVX512-NEXT: vpinsrb $3, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x03] ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X64-AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X64-AVX512-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # encoding: 
[0x0f,0xb6,0x44,0x24,0x28] @@ -3662,13 +3662,13 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X32-AVX1-LABEL: test_mm_set_epi8: ; X32-AVX1: # %bb.0: ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50] +; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48] ; X32-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] -; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48] -; X32-AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40] +; X32-AVX1-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x38] ; X32-AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38] -; X32-AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] +; X32-AVX1-NEXT: vpinsrb $3, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x03] ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30] ; X32-AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X32-AVX1-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28] @@ -3692,13 +3692,13 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a ; X32-AVX512-LABEL: test_mm_set_epi8: ; X32-AVX512: # %bb.0: ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50] +; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48] ; X32-AVX512-NEXT: vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48] -; X32-AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01] ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40] +; X32-AVX512-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01] +; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x38] ; X32-AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] -; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38] -; X32-AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03] +; X32-AVX512-NEXT: vpinsrb $3, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x03] ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30] ; X32-AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] ; X32-AVX512-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28] @@ -5234,29 +5234,29 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4 ; ; X64-AVX1-LABEL: test_mm_setr_epi16: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] -; 
X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08] +; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] ; X64-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X64-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X64-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X64-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X64-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X64-AVX1-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X64-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X64-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X64-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_setr_epi16: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] -; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d # encoding: [0x44,0x0f,0xb7,0x54,0x24,0x08] +; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] ; X64-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; X64-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X64-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X64-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X64-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X64-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X64-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X64-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X64-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; @@ -5290,29 +5290,29 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4 ; ; X32-AVX1-LABEL: test_mm_setr_epi16: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] -; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08] +; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08] ; X32-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X32-AVX1-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X32-AVX1-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X32-AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X32-AVX1-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X32-AVX1-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; 
X32-AVX1-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X32-AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X32-AVX1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] ; X32-AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_setr_epi16: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] -; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08] +; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08] ; X32-AVX512-NEXT: vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7] ; X32-AVX512-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01] ; X32-AVX512-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02] ; X32-AVX512-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03] ; X32-AVX512-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04] ; X32-AVX512-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05] -; X32-AVX512-NEXT: vpinsrw $6, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x06] +; X32-AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X32-AVX512-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10] ; X32-AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll index 7c22330d7804b..dfd17ffaed0b2 100644 --- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -201,9 +201,9 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() { ; ; X86-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,4294967295,0] -; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse41_packusdw_fold: @@ -222,9 +222,9 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() { ; ; X64-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,4294967295,0] -; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A] -; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: ## fixup A - 
offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..98a5883c6e4ce 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -61,12 +61,12 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: testw %ax, %ax ; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF ; X64-NEXT: movl %eax, %esi ; X64-NEXT: shll %cl, %esi ; X64-NEXT: movswl %si, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi +; X64-NEXT: addl $32767, %edx # imm = 0x7FFF ; X64-NEXT: cmpw %di, %ax ; X64-NEXT: cmovnel %edx, %esi ; X64-NEXT: movswl %si, %eax @@ -227,9 +227,9 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: xorl %edi, %edi diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index f91758b861b4c..6a06a889c8e14 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -70,75 +70,75 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $20, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax +; X86-NEXT: cmovnel %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovnel %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovel %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %edx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %eax +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovnel %edx, %ebx +; X86-NEXT: cmovnel 
%eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sarl %cl, %edi ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sarl %cl, %esi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: sarl $31, %esi ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmovel %edi, %esi ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: shrdl %cl, %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %edi +; X86-NEXT: shrdl %cl, %ebx, %edx ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: cmovnel %edi, %edx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: xorl $2147483647, %ecx # imm = 0x7FFFFFFF ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %esi -; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: notl %edi +; X86-NEXT: cmovel (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebp, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ebp, %esi ; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: orl %esi, %edx ; X86-NEXT: notl %ebp ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: cmovel %ebx, %esi +; X86-NEXT: cmovel %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload @@ -211,46 +211,49 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: shll %cl, %ebp ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: sarl %cl, %ebp -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %bl -; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl %cl, 
%eax ; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %dl +; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovel %ebp, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarl %cl, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: cmpl %eax, %edi -; X86-NEXT: cmovel %ebp, %edx +; X86-NEXT: cmovel %ebx, %edx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edi +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: sarl %cl, %ebp -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets %al -; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: sets %bl +; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF ; X86-NEXT: cmpl %ebp, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovel %edi, %ebx ; X86-NEXT: movl %esi, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb %ah, %cl ; X86-NEXT: shll %cl, %edi ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: sarl %cl, %ebp @@ -260,12 +263,13 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF ; X86-NEXT: cmpl %ebp, %esi ; X86-NEXT: cmovel %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, 12(%esi) -; X86-NEXT: movl %eax, 8(%esi) -; X86-NEXT: movl %edx, 4(%esi) -; X86-NEXT: movl %ebx, (%esi) -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -349,12 +353,12 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-AVX2-NEXT: vpsravd %ymm1, %ymm3, %ymm1 ; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] -; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 -; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm3 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpblendvb %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm3 +; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -367,46 +371,47 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: subl $16, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp -; X86-NEXT: sarl %cl, %ebp +; X86-NEXT: movl %esi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebx 
+; X86-NEXT: sarl %cl, %ebx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bx, %si +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movswl %di, %ebx ; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi -; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %bx, %dx +; X86-NEXT: cmovel %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movb %ah, %cl +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi +; X86-NEXT: sarl %cl, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %bl +; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF +; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi @@ -429,9 +434,9 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF ; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx ; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx @@ -495,25 +500,25 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: psllw $5, %xmm1 ; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtb %xmm1, %xmm4 +; X64-NEXT: pcmpgtb %xmm1, %xmm3 ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: psllw $4, %xmm2 -; X64-NEXT: pand %xmm4, %xmm2 -; X64-NEXT: pandn %xmm0, %xmm4 +; X64-NEXT: pand %xmm3, %xmm2 +; X64-NEXT: pandn %xmm0, %xmm3 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: por %xmm4, %xmm2 +; X64-NEXT: por %xmm3, %xmm2 ; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; X64-NEXT: paddb %xmm1, %xmm1 -; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: pcmpgtb %xmm1, %xmm6 -; X64-NEXT: 
movdqa %xmm6, %xmm7 -; X64-NEXT: pandn %xmm2, %xmm7 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtb %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm6 +; X64-NEXT: pandn %xmm2, %xmm6 ; X64-NEXT: psllw $2, %xmm2 -; X64-NEXT: pand %xmm6, %xmm2 +; X64-NEXT: pand %xmm3, %xmm2 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: por %xmm7, %xmm2 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: por %xmm6, %xmm2 ; X64-NEXT: paddb %xmm1, %xmm1 ; X64-NEXT: pxor %xmm6, %xmm6 ; X64-NEXT: pcmpgtb %xmm1, %xmm6 @@ -620,10 +625,10 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm1 ; X64-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 -; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v16i8: @@ -633,57 +638,58 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $44, %esp +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb %dh, %bl +; X86-NEXT: shlb %cl, %bl ; X86-NEXT: movb {{[0-9]+}}(%esp), %dl -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb %ch, %bh -; X86-NEXT: shlb %cl, %bh -; X86-NEXT: movzbl %bh, %esi -; X86-NEXT: sarb %cl, %bh +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: sarb %cl, %bl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %dh, %dh +; X86-NEXT: sets %al +; X86-NEXT: addl $127, %eax +; X86-NEXT: cmpb %bl, %dh +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb %ch, %dh +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shlb %cl, %dh +; X86-NEXT: movzbl %dh, %esi +; X86-NEXT: sarb %cl, %dh ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb %ch, %ch ; X86-NEXT: sets %al ; X86-NEXT: addl $127, %eax -; X86-NEXT: cmpb %bh, %ch +; X86-NEXT: cmpb %dh, %ch ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shlb %cl, %al -; X86-NEXT: movzbl %al, %esi -; X86-NEXT: sarb %cl, %al -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testb %bl, %bl -; X86-NEXT: sets %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: cmpb %al, %bl -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %dh, %al -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shlb %cl, %al -; X86-NEXT: movzbl %al, %esi -; X86-NEXT: sarb %cl, %al -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testb %dh, %dh -; X86-NEXT: sets %cl -; X86-NEXT: addl $127, %ecx -; X86-NEXT: cmpb %al, %dh -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: movb %ah, %al +; X86-NEXT: movb %bh, %dh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), 
%ecx +; X86-NEXT: shlb %cl, %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movzbl %dh, %esi +; X86-NEXT: sarb %cl, %dh +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %bh, %bh +; X86-NEXT: sets %al +; X86-NEXT: addl $127, %eax +; X86-NEXT: cmpb %dh, %bh +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb %ch, %al +; X86-NEXT: movb %dl, %cl ; X86-NEXT: shlb %cl, %al ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: sarb %cl, %al ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testb %ah, %ah +; X86-NEXT: testb %ch, %ch ; X86-NEXT: sets %dl ; X86-NEXT: addl $127, %edx -; X86-NEXT: cmpb %al, %ah +; X86-NEXT: cmpb %al, %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %esi, %edx diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll index 8ecc8b39ac468..752db7950a026 100644 --- a/llvm/test/CodeGen/X86/ssub_sat.ll +++ b/llvm/test/CodeGen/X86/ssub_sat.ll @@ -40,8 +40,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl ; X86-NEXT: movl %ecx, %edx @@ -178,8 +178,8 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: setns %al ; X86-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF ; X86-NEXT: subl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovol %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edi, %esi diff --git a/llvm/test/CodeGen/X86/ssub_sat_plus.ll b/llvm/test/CodeGen/X86/ssub_sat_plus.ll index 5baf7a1dac74c..38d67b23eb58f 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_plus.ll @@ -42,8 +42,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl ; X86-NEXT: movl %ecx, %edx @@ -140,9 +140,9 @@ define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind { define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind { ; X86-LABEL: func4: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mulb {{[0-9]+}}(%esp) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shlb $4, %al ; X86-NEXT: sarb $4, %al ; X86-NEXT: subb %al, %cl diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 88df3c175ec9c..2c2a2727c5768 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: 
vpsubsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512F-LABEL: v16i4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-LABEL: v16i4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -663,10 +663,10 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: @@ -764,10 +764,10 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i32: @@ -909,12 +909,12 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, 
%ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX1-NEXT: vblendvps %ymm0, %ymm3, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: @@ -1061,49 +1061,43 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; ; SSE41-LABEL: v16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm9 ; SSE41-NEXT: movdqa %xmm0, %xmm8 ; SSE41-NEXT: psubd %xmm4, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: psrad $31, %xmm10 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE41-NEXT: pxor %xmm4, %xmm1 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm10 +; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm8 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psubd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm10, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: psubd %xmm6, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE41-NEXT: pxor %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: pxor %xmm4, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: psubd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm11 -; SSE41-NEXT: pxor %xmm7, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pxor %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm11, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movaps %xmm8, %xmm0 ; SSE41-NEXT: retq @@ -1126,7 +1120,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, 
%ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1257,7 +1251,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; SSE41-LABEL: v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm2 @@ -1291,11 +1285,11 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: @@ -1304,11 +1298,11 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: # xmm2 = mem[0,0] -; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i64: @@ -1317,11 +1311,11 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: # xmm3 = mem[0,0] +; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX512F-NEXT: # xmm2 = mem[0,0] -; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm3, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: v2i64: @@ -1475,7 +1469,7 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 @@ -1483,9 +1477,9 
@@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm8 -; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm9 +; SSE41-NEXT: por %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 @@ -1493,12 +1487,12 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm5 ; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm8, %xmm5 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movapd %xmm7, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm9, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1541,12 +1535,12 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vxorpd %ymm0, %ymm4, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm4, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: @@ -1844,7 +1838,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 ; SSE41-NEXT: movdqa %xmm8, %xmm9 @@ -1888,10 +1882,10 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm4 ; SSE41-NEXT: por %xmm5, %xmm4 -; SSE41-NEXT: pxor %xmm13, %xmm4 ; SSE41-NEXT: movapd %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: pxor %xmm13, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1936,9 +1930,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] ; SSE41-NEXT: pand %xmm0, %xmm4 ; SSE41-NEXT: por %xmm7, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 @@ -1962,7 +1956,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 
{{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -2022,20 +2016,20 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: seto %dil +; SSE-NEXT: seto %al ; SSE-NEXT: movq %r8, %r10 ; SSE-NEXT: sarq $63, %r10 -; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: testb %al, %al ; SSE-NEXT: cmovneq %r10, %rcx ; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 ; SSE-NEXT: xorq %r11, %r10 -; SSE-NEXT: testb %dil, %dil +; SSE-NEXT: testb %al, %al ; SSE-NEXT: cmoveq %r8, %r10 ; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 @@ -2052,20 +2046,20 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: seto %dil +; AVX-NEXT: seto %al ; AVX-NEXT: movq %r8, %r10 ; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: testb %al, %al ; AVX-NEXT: cmovneq %r10, %rcx ; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 ; AVX-NEXT: xorq %r11, %r10 -; AVX-NEXT: testb %dil, %dil +; AVX-NEXT: testb %al, %al ; AVX-NEXT: cmoveq %r8, %r10 ; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll index b5b9ce95a46ba..29b77aee11754 100644 --- a/llvm/test/CodeGen/X86/stack-clash-large.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -141,16 +141,16 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i ; CHECK-X86-NEXT: .cfi_offset %edx, -12 ; CHECK-X86-NEXT: .cfi_offset %esi, -8 ; CHECK-X86-NEXT: movl 72056(%esp), %eax -; CHECK-X86-NEXT: movl 72048(%esp), %edx -; CHECK-X86-NEXT: movl 72040(%esp), %ecx -; CHECK-X86-NEXT: movl 72032(%esp), %esi -; CHECK-X86-NEXT: addl 72036(%esp), %esi -; CHECK-X86-NEXT: addl 72044(%esp), %ecx -; CHECK-X86-NEXT: addl %esi, %ecx -; CHECK-X86-NEXT: addl 72052(%esp), %edx +; CHECK-X86-NEXT: movl 72048(%esp), %ecx +; CHECK-X86-NEXT: movl 72032(%esp), %edx +; CHECK-X86-NEXT: addl 72036(%esp), %edx +; CHECK-X86-NEXT: movl 72040(%esp), %esi +; CHECK-X86-NEXT: addl 72044(%esp), %esi +; CHECK-X86-NEXT: addl %edx, %esi +; CHECK-X86-NEXT: addl 72052(%esp), %ecx ; CHECK-X86-NEXT: addl 72060(%esp), %eax -; CHECK-X86-NEXT: addl %edx, %eax ; CHECK-X86-NEXT: addl %ecx, %eax +; CHECK-X86-NEXT: addl %esi, %eax ; CHECK-X86-NEXT: movl %eax, 392(%esp) ; CHECK-X86-NEXT: movl %eax, 28792(%esp) ; CHECK-X86-NEXT: addl $72012, %esp # imm = 0x1194C diff --git a/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll b/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll index 83b98e6805d01..bbe4e7ff36838 100644 --- a/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll +++ 
b/llvm/test/CodeGen/X86/stack-folding-adx-x86_64.ll @@ -32,18 +32,15 @@ define i8 @stack_fold_addcarry_u32(i8 %a0, i32 %a1, i32 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: adcl %edx, %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: adcl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: movl %edx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -88,18 +85,15 @@ define i8 @stack_fold_addcarry_u64(i8 %a0, i64 %a1, i64 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: adcq %rdx, %rsi ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; CHECK-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; CHECK-NEXT: movq %rdx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -144,18 +138,15 @@ define i8 @stack_fold_addcarryx_u32(i8 %a0, i32 %a1, i32 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: adcl %edx, %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: adcl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: movl %edx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -200,18 +191,15 @@ define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: adcq %rdx, %rsi ; CHECK-NEXT: movq %rsi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; CHECK-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; CHECK-NEXT: movq %rdx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -256,18 +244,15 @@ define i8 @stack_fold_subborrow_u32(i8 %a0, i32 %a1, i32 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: sbbl %edx, %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: sbbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: movl %edx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -312,18 +297,15 @@ define i8 @stack_fold_subborrow_u64(i8 %a0, i64 %a1, i64 %a2, ptr %a3) { ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: addb $-1, %dil +; CHECK-NEXT: sbbq %rdx, %rsi ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: addb $-1, %al -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; CHECK-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; CHECK-NEXT: setb %al ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; CHECK-NEXT: movq %rdx, (%rcx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll index 6625cc4f07a27..9899dbf652a64 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -1210,11 +1210,10 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) { ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: popq %rbx @@ 
-1301,11 +1300,10 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) { ; CHECK-NEXT: .cfi_offset %r14, -32 ; CHECK-NEXT: .cfi_offset %r15, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: popq %rbx @@ -1671,11 +1669,10 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_dpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) @@ -1686,11 +1683,10 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_dpps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vdpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) @@ -3333,7 +3329,7 @@ define i32 @stack_fold_ucomisd(double %a0, double %a1) { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: sete %al -; CHECK-NEXT: leal -1(%rax,%rax), %eax +; CHECK-NEXT: leal -1(,%rax,2), %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ueq double %a0, %a1 @@ -3370,7 +3366,7 @@ define i32 @stack_fold_ucomiss(float %a0, float %a1) { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: sete %al -; CHECK-NEXT: leal -1(%rax,%rax), %eax +; CHECK-NEXT: leal -1(,%rax,2), %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ueq float %a0, %a1 diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll index 306ee31098090..981980faf9237 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -1288,11 +1288,10 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi define <4 x float> 
@stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_dpps: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: dpps $7, %xmm1, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: dpps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) @@ -2306,7 +2305,7 @@ define i32 @stack_fold_ucomisd(double %a0, double %a1) { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: sete %al -; CHECK-NEXT: leal -1(%rax,%rax), %eax +; CHECK-NEXT: leal -1(,%rax,2), %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ueq double %a0, %a1 @@ -2343,7 +2342,7 @@ define i32 @stack_fold_ucomiss(float %a0, float %a1) { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: sete %al -; CHECK-NEXT: leal -1(%rax,%rax), %eax +; CHECK-NEXT: leal -1(,%rax,2), %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = fcmp ueq float %a0, %a1 diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll index 46fd1ab5dfb6d..0730b10af8c3c 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll @@ -793,12 +793,11 @@ define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) { define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistri: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) @@ -809,11 +808,10 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistrm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpcmpistrm $7, %xmm1, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) diff --git 
a/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll index e209a1ac627f1..6741b2e21ea67 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-sse42.ll @@ -979,12 +979,11 @@ define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) { define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistri: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0 +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) @@ -995,11 +994,10 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistrm: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: pcmpistrm $7, %xmm1, %xmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) diff --git a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll index 758cb8b7b63d5..c096aa0fa4a83 100644 --- a/llvm/test/CodeGen/X86/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/X86/statepoint-call-lowering.ll @@ -204,27 +204,18 @@ declare void @consume_attributes(i32, ptr nest, i32, ptr byval(%struct2)) define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" { ; CHECK-LABEL: test_attributes: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: subq $8, %rsp -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: movups %xmm0, (%rsp) ; CHECK-NEXT: movl $42, %edi ; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: movl $17, %esi -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: pushq %rdx -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 -; CHECK-NEXT: pushq %rcx -; CHECK-NEXT: .cfi_adjust_cfa_offset 8 ; CHECK-NEXT: callq consume_attributes@PLT ; CHECK-NEXT: .Ltmp9: -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: .cfi_adjust_cfa_offset -32 -; CHECK-NEXT: popq %rax +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll b/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll index 72f4fa37dc5e5..4b0b8149ff3b5 100644 --- a/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll +++ 
b/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll @@ -44,17 +44,17 @@ define void @test_illegal_constants() gc "statepoint-example" { ; CHECK-NEXT: subq $248, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 256 ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $144, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll b/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll index 1d45c7db84f81..dd518374878cd 100644 --- a/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll +++ b/llvm/test/CodeGen/X86/statepoint-stackmap-format.ll @@ -78,18 +78,18 @@ declare void @many_arg(i64, i64, i64, i64, i64, i64, i64, i64) define i32 @test_spadj(ptr addrspace(1) %p) gc "statepoint-example" { ; CHECK-LABEL: test_spadj - ; CHECK: movq %rdi, (%rsp) + ; CHECK: subq $24, %rsp + ; CHECK: movq %rdi, 16(%rsp) + ; CHECK: xorps %xmm0, %xmm0 + ; CHECK: movups %xmm0, (%rsp) ; CHECK: xorl %edi, %edi ; CHECK: xorl %esi, %esi ; CHECK: xorl %edx, %edx ; CHECK: xorl %ecx, %ecx ; CHECK: xorl %r8d, %r8d ; CHECK: xorl %r9d, %r9d - ; CHECK: pushq $0 - ; CHECK: pushq $0 ; CHECK: callq many_arg - ; CHECK: addq $16, %rsp - ; CHECK: movq (%rsp) + ; CHECK: movq 16(%rsp) %statepoint_token = call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void (i64, i64, i64, i64, i64, i64, i64, i64)) @many_arg, i32 8, i32 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0, i32 0) ["gc-live"(ptr addrspace(1) %p)] %p.relocated = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %statepoint_token, i32 0, i32 0) ; (%p, %p) %ld = load i32, ptr addrspace(1) %p.relocated @@ -148,7 +148,7 @@ declare ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token, i32, i32) #3 ; CHECK-NEXT: .quad 8 ; CHECK-NEXT: .quad 1 ; CHECK-NEXT: .quad test_spadj -; CHECK-NEXT: .quad 8 +; CHECK-NEXT: .quad 24 ; CHECK-NEXT: .quad 1 ; CHECK-NEXT: .quad test_fixed_arg ; CHECK-NEXT: .quad 8 diff --git a/llvm/test/CodeGen/X86/strict-fsub-combines.ll b/llvm/test/CodeGen/X86/strict-fsub-combines.ll index 774ea02ccd87a..04c149248fd73 100644 --- a/llvm/test/CodeGen/X86/strict-fsub-combines.ll +++ b/llvm/test/CodeGen/X86/strict-fsub-combines.ll @@ -8,10 +8,10 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: subss %xmm1, %xmm0 -; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: subss %xmm0, %xmm1 +; X86-NEXT: movss %xmm1, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: wait ; X86-NEXT: popl %eax @@ -36,10 +36,10 @@ define double @fneg_strict_fsub_to_strict_fadd_d(double %x, double %y) nounwind ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: subsd %xmm1, %xmm0 -; X86-NEXT: movsd %xmm0, (%esp) +; X86-NEXT: subsd %xmm0, %xmm1 +; X86-NEXT: movsd %xmm1, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: wait ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll index 39cbee54737c3..74b1700520a9f 100644 --- a/llvm/test/CodeGen/X86/sttni.ll +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -1173,8 +1173,8 @@ define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr ; X86-LABEL: pcmpistr_mask_index: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 ; X86-NEXT: movdqa %xmm0, (%edx) ; X86-NEXT: movl %ecx, (%eax) @@ -1198,21 +1198,21 @@ entry: define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_index_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: pcmpistri $24, %xmm1, %xmm2 -; X86-NEXT: setb %bl -; X86-NEXT: movdqa %xmm0, (%esi) -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: setb %dl +; X86-NEXT: movdqa %xmm0, (%edi) +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %edx, 
(%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_index_flag: @@ -1240,23 +1240,23 @@ entry: define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, ptr %rhsptr, ptr %mptr, ptr %iptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_index_flag_load: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movdqu (%ecx), %xmm2 ; X86-NEXT: pcmpistrm $24, %xmm2, %xmm0 -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: pcmpistri $24, %xmm2, %xmm1 -; X86-NEXT: setb %bl -; X86-NEXT: movdqa %xmm0, (%esi) -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: setb %dl +; X86-NEXT: movdqa %xmm0, (%edi) +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_index_flag_load: diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll index 7d5db07c0172a..625a815e28fef 100644 --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -20,9 +20,9 @@ entry: define i256 @sub256(i256 %a, i256 %b) nounwind { ; CHECK-LABEL: sub256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: subq %r9, %rsi ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; CHECK-NEXT: movq %rcx, 16(%rdi) @@ -92,29 +92,29 @@ entry: define %S @sub(ptr nocapture readonly %this, %S %arg.b) { ; CHECK-LABEL: sub: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq (%rsi), %rdi -; CHECK-NEXT: movq 8(%rsi), %r10 -; CHECK-NEXT: subq %rdx, %rdi +; CHECK-NEXT: movq (%rsi), %r10 +; CHECK-NEXT: movq 8(%rsi), %rax +; CHECK-NEXT: subq %rdx, %r10 ; CHECK-NEXT: setae %dl ; CHECK-NEXT: addb $-1, %dl -; CHECK-NEXT: adcq $0, %r10 +; CHECK-NEXT: adcq $0, %rax ; CHECK-NEXT: setb %dl ; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: notq %rcx -; CHECK-NEXT: addq %r10, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: adcq 16(%rsi), %rdx -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %r10d +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %r11d ; CHECK-NEXT: notq %r8 ; CHECK-NEXT: addq %rdx, %r8 -; CHECK-NEXT: adcq 24(%rsi), %r10 +; CHECK-NEXT: adcq 24(%rsi), %r11 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: notq %r9 -; CHECK-NEXT: addq %r10, %r9 -; CHECK-NEXT: movq %rdi, (%rax) -; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %r9, 24(%rax) +; CHECK-NEXT: addq %r11, %r9 +; CHECK-NEXT: movq %r10, (%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 @@ -586,23 +586,23 @@ define void @PR39464(ptr noalias nocapture sret(%struct.U192) %0, ptr nocapture define void @sub_U256_without_i128_or_recursive(ptr sret(%uint256) %0, ptr %1, ptr %2) nounwind { ; CHECK-LABEL: sub_U256_without_i128_or_recursive: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rcx -; CHECK-NEXT: movq 8(%rsi), %rdi ; CHECK-NEXT: movq 16(%rsi), %r8 -; CHECK-NEXT: 
movq 24(%rsi), %rsi -; CHECK-NEXT: xorl %r9d, %r9d +; CHECK-NEXT: movq 24(%rsi), %r9 +; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: subq 16(%rdx), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: subq 24(%rdx), %rsi +; CHECK-NEXT: setb %r10b +; CHECK-NEXT: subq 24(%rdx), %r9 ; CHECK-NEXT: subq (%rdx), %rcx -; CHECK-NEXT: sbbq 8(%rdx), %rdi +; CHECK-NEXT: movq 8(%rsi), %rsi +; CHECK-NEXT: sbbq 8(%rdx), %rsi ; CHECK-NEXT: sbbq $0, %r8 -; CHECK-NEXT: sbbq %r9, %rsi -; CHECK-NEXT: movq %rcx, (%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rsi, 24(%rax) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: sbbq %r10, %r9 +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: movq %rsi, 8(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 %5 = getelementptr inbounds %uint256, ptr %1, i64 0, i32 0, i32 1 diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 75333bf835f89..9021e0bb50a9c 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -757,8 +757,8 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) { ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovaps %xmm1, (%eax) ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X86-AVX-NEXT: retl @@ -774,8 +774,8 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) { ; ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq @@ -806,7 +806,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; X86-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6 @@ -827,7 +827,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; ; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 @@ -856,7 +856,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4] ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6 @@ -877,7 +877,7 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> ; ; X64-AVX2-LABEL: 
fallback_broadcast_v4i64_to_v8i64: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 @@ -917,11 +917,11 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d ; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X86-AVX: # %bb.0: # %entry ; X86-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ; X86-AVX-NEXT: vmovupd %ymm0, ga2 ; X86-AVX-NEXT: vmovupd %ymm2, gb2+32 ; X86-AVX-NEXT: vmovupd %ymm1, gb2 @@ -932,9 +932,9 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 +; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; X86-AVX512-NEXT: vmovupd %ymm0, ga2 ; X86-AVX512-NEXT: vmovupd %zmm1, gb2 ; X86-AVX512-NEXT: vzeroupper @@ -943,11 +943,11 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ; X64-AVX-NEXT: vmovupd %ymm0, ga2(%rip) ; X64-AVX-NEXT: vmovupd %ymm2, gb2+32(%rip) ; X64-AVX-NEXT: vmovupd %ymm1, gb2(%rip) @@ -958,9 +958,9 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 +; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; X64-AVX512-NEXT: vmovupd %ymm0, ga2(%rip) ; X64-AVX512-NEXT: vmovupd %zmm1, gb2(%rip) ; X64-AVX512-NEXT: vzeroupper @@ -992,13 +992,13 @@ define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4 ; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5 +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; 
X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..8068f59418d02 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -3333,9 +3333,9 @@ define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; ; SSE2-LABEL: vec384_v3i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rax ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor (%rdi), %xmm0 +; SSE2-NEXT: movq 16(%rdi), %rax ; SSE2-NEXT: movdqa %xmm0, (%rsi) ; SSE2-NEXT: notq %rax ; SSE2-NEXT: movq %rax, 16(%rsi) @@ -3405,9 +3405,9 @@ define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; ; SSE2-LABEL: vec384_v3f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rax ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor (%rdi), %xmm0 +; SSE2-NEXT: movq 16(%rdi), %rax ; SSE2-NEXT: movdqa %xmm0, (%rsi) ; SSE2-NEXT: notq %rax ; SSE2-NEXT: movq %rax, 16(%rsi) diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll index 72e4fe410e269..2bdc70cdf9010 100644 --- a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -143,8 +143,8 @@ define i32 @loop_shared_header(ptr %exe, i32 %exesz, i32 %headsize, i32 %min, i3 ; CHECK-NEXT: jae .LBB1_11 ; CHECK-NEXT: # %bb.13: # %if.end287.i ; CHECK-NEXT: # in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: # implicit-def: $cl +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB1_4 ; CHECK-NEXT: # %bb.14: # %if.end308.i ; CHECK-NEXT: # in Loop: Header=BB1_4 Depth=1 diff --git a/llvm/test/CodeGen/X86/tbm_patterns.ll b/llvm/test/CodeGen/X86/tbm_patterns.ll index e595803efdfca..7715ab27f895a 100644 --- a/llvm/test/CodeGen/X86/tbm_patterns.ll +++ b/llvm/test/CodeGen/X86/tbm_patterns.ll @@ -50,8 +50,8 @@ define i32 @test_x86_tbm_bextri_u32_z(i32 %a, i32 %b) nounwind { define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2: ; CHECK: # %bb.0: +; CHECK-NEXT: bextrl $3076, %edi, %eax # imm = 0xC04 ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 ; CHECK-NEXT: cmovnel %edx, %eax ; CHECK-NEXT: retq %t0 = lshr i32 %a, 4 @@ -125,8 +125,8 @@ define i64 @test_x86_tbm_bextri_u64_z(i64 %a, i64 %b) nounwind { define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind { ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2: ; CHECK: # %bb.0: +; CHECK-NEXT: bextrl $3076, %edi, %eax # imm = 0xC04 ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 ; CHECK-NEXT: cmovneq %rdx, %rax ; CHECK-NEXT: retq %t0 = lshr i64 %a, 4 diff --git a/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll b/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll index 5c5f7045ea030..dc21bbf2cd9b2 100644 --- a/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll +++ b/llvm/test/CodeGen/X86/trunc-nsw-nuw.ll @@ -47,7 +47,8 @@ entry: define ptr @gep_nusw_zext_nneg_add_trunc_nuw_nsw(ptr %p, i64 %x) nounwind { ; CHECK-LABEL: gep_nusw_zext_nneg_add_trunc_nuw_nsw: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: leaq 20(%rdi,%rsi,4), %rax +; CHECK-NEXT: leaq (%rdi,%rsi,4), %rax +; CHECK-NEXT: addq $20, %rax ; CHECK-NEXT: retq entry: %trunc = trunc nuw nsw i64 %x to i32 diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll 
b/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll index 01f8df684e740..2dfba90c8f518 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps.ll @@ -69,7 +69,7 @@ define <8 x float> @transform_VPERMILPSYrm(ptr %ap) nounwind { define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind { ; CHECK-AVX1-LABEL: transform_VPERMILPSrm: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX1-DELAY-LABEL: transform_VPERMILPSrm: @@ -79,7 +79,7 @@ define <4 x float> @transform_VPERMILPSrm(ptr %ap) nounwind { ; ; CHECK-AVX2-LABEL: transform_VPERMILPSrm: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX2-DELAY-LABEL: transform_VPERMILPSrm: diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll index 0a3c2ae344fd3..5c43ed97aa011 100644 --- a/llvm/test/CodeGen/X86/uadd_sat.ll +++ b/llvm/test/CodeGen/X86/uadd_sat.ll @@ -32,8 +32,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovbl %ecx, %edx @@ -125,7 +125,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -138,6 +137,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: addl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl %ebx, %esi ; X86-NEXT: addl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovbl %ebx, %edi ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %esi, 8(%eax) diff --git a/llvm/test/CodeGen/X86/uadd_sat_plus.ll b/llvm/test/CodeGen/X86/uadd_sat_plus.ll index 654e3d77f52aa..1ddc4cc3e58e8 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_plus.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_plus.ll @@ -34,8 +34,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovbl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index d744ce6ed6af0..27a5c5df52df8 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -501,7 +501,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -510,7 +510,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX512-LABEL: v16i4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 @@ -925,7 +925,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 @@ -1160,38 +1160,38 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: movq $-1, %rdi -; SSE-NEXT: cmovbq %rdi, %rsi -; SSE-NEXT: cmovbq %rdi, %rdx +; SSE-NEXT: movq $-1, %r9 +; SSE-NEXT: cmovbq %r9, %rsi +; SSE-NEXT: cmovbq %r9, %rdx ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: cmovbq %rdi, %r8 -; SSE-NEXT: cmovbq %rdi, %rcx -; SSE-NEXT: movq %r8, 24(%rax) -; SSE-NEXT: movq %rcx, 16(%rax) -; SSE-NEXT: movq %rdx, 8(%rax) -; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: cmovbq %r9, %r8 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: cmovbq %r9, %rcx +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rcx, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: movq $-1, %rdi -; AVX-NEXT: cmovbq %rdi, %rsi -; AVX-NEXT: cmovbq %rdi, %rdx +; AVX-NEXT: movq $-1, %r9 +; AVX-NEXT: cmovbq %r9, %rsi +; AVX-NEXT: cmovbq %r9, %rdx ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: cmovbq %rdi, %r8 -; AVX-NEXT: cmovbq %rdi, %rcx -; AVX-NEXT: movq %r8, 24(%rax) -; AVX-NEXT: movq %rcx, 16(%rax) -; AVX-NEXT: movq %rdx, 8(%rax) -; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: cmovbq %r9, %r8 +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: cmovbq %r9, %rcx +; AVX-NEXT: movq %r8, 24(%rdi) +; AVX-NEXT: movq %rcx, 16(%rdi) +; AVX-NEXT: movq %rdx, 8(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 6a52acfe2fb30..0ff517771e00c 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -310,9 +310,9 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind { ; X86-LABEL: ucmp_wide_result: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movsbl %cl, %ecx ; X86-NEXT: movl %ecx, (%eax) @@ -469,27 +469,27 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movsbl %bl, %edi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %bl -; X86-NEXT: sbbb $0, %bl +; X86-NEXT: seta %bh ; X86-NEXT: movsbl %bl, %esi +; X86-NEXT: sbbb $0, %bh +; X86-NEXT: movsbl %bh, %edi ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movsbl %cl, %ecx ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -604,33 +604,29 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X86-LABEL: ucmp_narrow_vec_result: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %cl ; X86-NEXT: sbbb $0, %cl -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: seta %ch ; X86-NEXT: sbbb $0, %ch -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: seta %bl -; X86-NEXT: sbbb $0, %bl ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl -; X86-NEXT: movb %dl, 3(%eax) -; X86-NEXT: movb %bl, 2(%eax) +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: seta %dh +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %dh +; X86-NEXT: movb %dh, 3(%eax) +; X86-NEXT: movb %dl, 2(%eax) ; X86-NEXT: movb %ch, 1(%eax) ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 %1 = call <4 x i8> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y) ret <4 x i8> %1 @@ -690,18 +686,18 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl ; X86-NEXT: seta %dl ; X86-NEXT: sbbb $0, %dl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl ; X86-NEXT: seta %bl ; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movsbl %bl, %esi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch ; X86-NEXT: seta %ch +; X86-NEXT: movsbl %bl, %esi ; X86-NEXT: sbbb $0, %ch ; X86-NEXT: movsbl %ch, %edi ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl @@ -867,48 +863,48 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl -; X86-NEXT: seta %bl -; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, (%esp) # 1-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %bh -; X86-NEXT: sbbb $0, %bh +; X86-NEXT: seta %bl +; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %al -; X86-NEXT: sbbb $0, %al -; X86-NEXT: movsbl %al, 
%eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: seta %bh +; X86-NEXT: sbbb $0, %bh ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: seta %ah +; X86-NEXT: movsbl %al, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbb $0, %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al +; X86-NEXT: movsbl %ah, %edi ; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: seta %ah ; X86-NEXT: movsbl %al, %ebp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %al -; X86-NEXT: sbbb $0, %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al ; X86-NEXT: seta %al +; X86-NEXT: movsbl %ah, %esi ; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah +; X86-NEXT: seta %ah ; X86-NEXT: movsbl %al, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al -; X86-NEXT: seta %al -; X86-NEXT: sbbb $0, %al -; X86-NEXT: movsbl %al, %ecx +; X86-NEXT: sbbb $0, %ah +; X86-NEXT: movsbl %ah, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ecx, 60(%eax) ; X86-NEXT: movl %edx, 56(%eax) @@ -921,10 +917,10 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: movsbl %bh, %ecx ; X86-NEXT: movl %ecx, 36(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: movsbl (%esp), %edx # 1-byte Folded Reload +; X86-NEXT: movsbl %bl, %edx ; X86-NEXT: movl %edx, 32(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: movsbl %bl, %edi +; X86-NEXT: movsbl (%esp), %edi # 1-byte Folded Reload ; X86-NEXT: movl %edi, 28(%eax) ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload @@ -1358,10 +1354,10 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp @@ -1369,47 +1365,47 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind { ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seta %al ; X86-NEXT: sbbb $0, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seta %bh ; X86-NEXT: sbbb $0, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/udiv-exact.ll b/llvm/test/CodeGen/X86/udiv-exact.ll index 271d11edff9a7..da797ab3a7ca9 100644 --- a/llvm/test/CodeGen/X86/udiv-exact.ll +++ b/llvm/test/CodeGen/X86/udiv-exact.ll @@ -85,9 +85,9 @@ define <4 x i32> @test5(<4 x i32> %x) { ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] ; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -110,9 +110,9 @@ define <4 x i32> @test6(<4 x i32> %x) { ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] ; X86-NEXT: pmuludq %xmm0, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -132,8 +132,8 @@ define <4 x i32> @test7(<4 x i32> %x) { ; X86: # %bb.0: ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -154,9 
+154,9 @@ define <4 x i32> @test8(<4 x i32> %x) { ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] ; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/uint64-to-float.ll b/llvm/test/CodeGen/X86/uint64-to-float.ll index 03a8171589622..3299b9a106e49 100644 --- a/llvm/test/CodeGen/X86/uint64-to-float.ll +++ b/llvm/test/CodeGen/X86/uint64-to-float.ll @@ -54,13 +54,13 @@ define float @test(i64 %a) nounwind { ; X86-WIN-NEXT: movl 12(%ebp), %eax ; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) -; X86-WIN-NEXT: shrl $31, %eax ; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp) ; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300 ; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) ; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-WIN-NEXT: shrl $31, %eax ; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4) ; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp) ; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll index b62a07eec1ce6..d4da357d76199 100644 --- a/llvm/test/CodeGen/X86/uint_to_half.ll +++ b/llvm/test/CodeGen/X86/uint_to_half.ll @@ -91,9 +91,9 @@ define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) { ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] ; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -148,9 +148,9 @@ define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) { ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] ; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index f589d4a7b04a9..8ef59e4ad6018 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -235,28 +235,28 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ebx, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmovbl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -286,32 +286,32 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: movl $1, %edi -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovnel %edx, %ebx +; X86-NEXT: cmovel %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: negl %ebp ; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: sbbl %ecx, %ebp ; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %edx, %ebx -; X86-NEXT: orl %esi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmovbl %edx, %ebp +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %ebp, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -354,7 +354,7 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: test_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -418,36 +418,34 @@ define <2 x i64> @test_v2i64_1(<2 x i64> %a) nounwind { ; ; X86-LABEL: test_v2i64_1: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl $1, %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovel %ebp, %ecx -; X86-NEXT: cmovel %ebx, %ecx -; X86-NEXT: cmpl $1, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: testl %edx, %edx -; X86-NEXT: cmovnel %edi, %ebp -; X86-NEXT: cmovel %ebx, %ebp -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl $1, %esi +; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmovel %edi, %ecx +; X86-NEXT: cmpl $1, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %r = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> ) ret <2 x i64> %r @@ -573,8 +571,14 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmoval %esi, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edi, %eax @@ -583,16 +587,10 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: cmoval %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmoval %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -624,21 +622,21 @@ define <4 x i32> @test_v4i32_1(<4 x i32> %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $1, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: cmpl $1, %edx -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmpl $1, %esi ; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl $1, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ 
-690,31 +688,31 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: cmoval %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: cmoval %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edi, %eax -; X86-NEXT: cmoval %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: cmoval %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %edx, %eax -; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: cmpl %edi, %ebp +; X86-NEXT: cmoval %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %esi, %ebp +; X86-NEXT: cmoval %ebp, %esi +; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: cmoval %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmoval %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmoval %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %edx, %eax @@ -726,14 +724,14 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 28(%ecx) ; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl %ebp, 20(%ecx) +; X86-NEXT: movl %ebx, 16(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %eax, 12(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl %edi, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -778,41 +776,41 @@ define <8 x i32> @test_v8i32_1(<8 x i32> %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $1, %eax ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl $1, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: cmpl $1, %esi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl $1, %edi ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: cmpl $1, %ebx ; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: cmpl $1, %ebp ; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: cmpl $1, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl $1, %eax ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %esi, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%ecx) -; X86-NEXT: movl %ebp, 20(%ecx) -; X86-NEXT: movl %ebx, 16(%ecx) -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: movl %eax, 20(%ecx) +; X86-NEXT: movl %edx, 16(%ecx) +; X86-NEXT: movl %ebp, 12(%ecx) +; X86-NEXT: movl %ebx, 8(%ecx) +; X86-NEXT: movl %edi, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl %ecx, %eax @@ -845,32 +843,32 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %di, %cx +; X86-NEXT: cmoval %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %si, %cx +; X86-NEXT: cmoval %ecx, %esi +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: cmoval %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %bp, %ax -; X86-NEXT: cmoval %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %bx, %ax ; X86-NEXT: cmoval %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmoval %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmoval %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %dx, %ax -; X86-NEXT: cmoval %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw %cx, %ax -; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmoval %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpw %dx, %ax @@ -882,14 +880,14 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) ; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %bp, 10(%ecx) +; X86-NEXT: movw %bx, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movw %ax, 6(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 8(%ecx) -; X86-NEXT: movw %si, 6(%ecx) -; X86-NEXT: movw %di, 4(%ecx) -; X86-NEXT: movw %bx, 2(%ecx) -; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movw %ax, 4(%ecx) +; X86-NEXT: movw %si, 2(%ecx) +; X86-NEXT: movw %di, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl 
%esi @@ -924,20 +922,20 @@ define <8 x i16> @test_v8i16_1(<8 x i16> %a) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpw $1, %si +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpw $1, %di +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpw $1, %bx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpw $1, %bp +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpw $1, %dx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: cmpw $1, %bp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: cmpw $1, %bx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: cmpw $1, %di -; X86-NEXT: adcl $0, %edi -; X86-NEXT: cmpw $1, %si -; X86-NEXT: adcl $0, %esi ; X86-NEXT: cmpw $1, %cx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -953,11 +951,11 @@ define <8 x i16> @test_v8i16_1(<8 x i16> %a) nounwind { ; X86-NEXT: movw %ax, 12(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 10(%ecx) -; X86-NEXT: movw %si, 8(%ecx) -; X86-NEXT: movw %di, 6(%ecx) +; X86-NEXT: movw %dx, 8(%ecx) +; X86-NEXT: movw %bp, 6(%ecx) ; X86-NEXT: movw %bx, 4(%ecx) -; X86-NEXT: movw %bp, 2(%ecx) -; X86-NEXT: movw %dx, (%ecx) +; X86-NEXT: movw %di, 2(%ecx) +; X86-NEXT: movw %si, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi @@ -988,20 +986,20 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $40, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %bl, %al -; X86-NEXT: cmoval %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %dl, %al ; X86-NEXT: cmoval %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpb %cl, %al -; X86-NEXT: cmoval %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb %cl, %al @@ -1125,10 +1123,10 @@ define <16 x i8> @test_v16i8_1(<16 x i8> %a) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpb $1, %bl -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb $1, %dl +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpb $1, %dl ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index 7a5cdbb9ce758..0c4f31bdba6fb 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -150,29 +150,28 @@ define i128 @test_i128(i128 
%a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl %ebx, %edx ; X86-NEXT: cmovbl %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -256,24 +255,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { ; ; X86-LABEL: test_v3i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: cmovbl %ebx, %eax -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: cmovbl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovbl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmpl %ecx, %esi ; X86-NEXT: cmovbl %esi, %ecx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl %r = call <3 x i32> @llvm.umin.v3i32(<3 x i32> %a, <3 x i32> %b) ret <3 x i32> %r diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index ccabb360a990c..fb19eb6f884e4 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -81,36 +81,35 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $76, %esp -; X86-NEXT: movl $4095, %ecx # imm = 0xFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl $4095, %ebx # imm = 0xFFF ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: andl %ebx, %ecx 
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx @@ -128,15 +127,15 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi @@ -462,21 +461,21 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: imull %esi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %esi -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %edi -; X86-NEXT: addl %esi, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addl %edx, %ebx ; X86-NEXT: addl %esi, %ebx ; X86-NEXT: addl %ebp, %eax @@ -519,63 +518,62 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r8, %rax +; 
X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %r10, %rbx +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r15, %rbx +; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r14, %rbp ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r10d -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rbp, %r13 ; X64-NEXT: adcq %r10, %r12 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %r15, %r10 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %r10, %r15 ; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: imulq %r9, %r11 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: imulq %rbx, %r11 ; X64-NEXT: addq %r13, %r14 ; X64-NEXT: adcq %r12, %r15 ; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r10, %rbp @@ -585,19 +583,19 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: mulq %r10 ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: imulq %r10, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq %r10, %r8 +; X64-NEXT: addq %rdx, %r8 ; X64-NEXT: addq %r14, %r12 ; X64-NEXT: adcq %r15, %rax -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: imulq %r9, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx +; X64-NEXT: adcq %r11, %r8 +; X64-NEXT: imulq %rbx, %rcx +; X64-NEXT: imulq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: addq %r8, %rsi +; X64-NEXT: addq %r9, %rsi ; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movq %rbx, 8(%rdi) +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: movq %r12, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..2ee7d093965c6 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -43,26 +43,26 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %esi, %eax -; 
X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %edx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shldl $30, %eax, %edx -; X86-NEXT: shldl $30, %esi, %eax +; X86-NEXT: shldl $30, %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -116,38 +116,36 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; ; X86-LABEL: vec: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: shldl $30, %edi, %ecx ; X86-NEXT: shldl $30, %eax, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shldl $30, %eax, %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: shldl $30, %ebx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %edx -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %ebp, 8(%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edx, 12(%ebx) +; X86-NEXT: movl %edi, 8(%ebx) +; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2) ret <4 x i32> %tmp @@ -236,15 +234,15 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: imull {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -268,11 +266,11 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: 
pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %edx, %edi @@ -306,31 +304,30 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: shldl $1, %ecx, %edi ; X86-NEXT: shrdl $31, %ecx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -354,30 +351,30 @@ define i64 @func9(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 8c7078c726328..e9ef0bb4212d5 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,31 +52,31 
@@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: shrl $2, %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebx, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: cmovel %eax, %edx @@ -161,7 +161,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: cmpl $4, %ecx ; X64-NEXT: cmovael %eax, %edx ; X64-NEXT: movd %edx, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: movd %xmm0, %edx ; X64-NEXT: imulq %rcx, %rdx @@ -170,21 +169,22 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: shrdl $2, %ecx, %edx ; X64-NEXT: cmpl $4, %ecx ; X64-NEXT: cmovael %eax, %edx -; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: movd %edx, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: movd %xmm0, %edx ; X64-NEXT: imulq %rcx, %rdx ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: shrq $32, %rcx ; X64-NEXT: shrdl $2, %ecx, %edx +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X64-NEXT: cmpl $4, %ecx ; X64-NEXT: cmovael %eax, %edx ; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: vec: @@ -194,16 +194,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrdl $2, %edx, %esi ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovael %ecx, %esi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shrdl $2, %edx, %ebx @@ -278,28 +277,28 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%esi -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setne %dl -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %cl -; X86-NEXT: andb %dl, %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %bl +; X86-NEXT: andb %cl, %bl ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: seto %cl +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: seto %ch -; X86-NEXT: orb %bl, %ch -; X86-NEXT: orb %cl, %ch -; X86-NEXT: leal (%edi,%eax), %esi +; X86-NEXT: seto %bh +; X86-NEXT: orb %cl, %bh +; X86-NEXT: orb %bl, %bh +; X86-NEXT: leal (%esi,%eax), %ecx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: setb %cl -; X86-NEXT: orb %ch, %cl +; X86-NEXT: orb %bh, %cl ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: cmovnel %ecx, %edx @@ -356,32 +355,33 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; X64-NEXT: movd %xmm2, %ecx ; X64-NEXT: mull %ecx -; X64-NEXT: movl $-1, %ecx -; X64-NEXT: cmovol %ecx, %eax -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: movl $-1, %esi +; X64-NEXT: cmovol %esi, %ecx +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: movd %xmm2, %edx ; X64-NEXT: mull %edx -; X64-NEXT: cmovol %ecx, %eax +; X64-NEXT: cmovol %esi, %eax +; X64-NEXT: movd %ecx, %xmm2 ; X64-NEXT: movd %eax, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %edx -; X64-NEXT: mull %edx -; X64-NEXT: cmovol %ecx, %eax -; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: mull %ecx +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: cmovol %esi, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-NEXT: movd %xmm0, %edx ; X64-NEXT: mull %edx -; X64-NEXT: cmovol %ecx, %eax -; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: cmovol %esi, %eax +; X64-NEXT: movd %eax, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; X64-NEXT: retq ; ; X86-LABEL: vec2: @@ -391,14 +391,13 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %edi ; X86-NEXT: cmovol %edi, %esi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: cmovol %edi, %ebx @@ -442,22 
+441,21 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: adcl $0, %edi @@ -496,22 +494,21 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: adcl $0, %ecx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 4c3170304b980..27252ee65919e 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -44,59 +44,60 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: leal (%ecx,%esi), %eax +; X86-NEXT: addl %eax, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: 
movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx -; X86-NEXT: leal (%esi,%eax), %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: leal (%ecx,%esi), %eax +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: adcl %edi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax @@ -128,7 +129,6 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: orb %ch, %bl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: setne %bh ; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -144,6 +144,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: orb %bl, %al ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll index 132683cdb0f9e..002fb1f15b541 100644 --- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -16,28 +16,28 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setne %dl -; X86-NEXT: testl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %cl -; X86-NEXT: andb %dl, %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: 
setne %bl +; X86-NEXT: andb %cl, %bl ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: seto %cl +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: seto %ch -; X86-NEXT: orb %bl, %ch -; X86-NEXT: orb %cl, %ch -; X86-NEXT: leal (%edi,%eax), %esi +; X86-NEXT: seto %bh +; X86-NEXT: orb %cl, %bh +; X86-NEXT: orb %bl, %bh +; X86-NEXT: leal (%esi,%eax), %ecx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: setb %cl -; X86-NEXT: orb %ch, %cl +; X86-NEXT: orb %bh, %cl ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll index 58fd6492f2ed5..5a3130d3b13be 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -10,11 +10,11 @@ define <4 x i32> @out_constant_varx_mone(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_varx_mone: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] ; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -149,11 +149,11 @@ define <4 x i32> @in_constant_varx_mone_invmask(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @out_constant_varx_42(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_varx_42: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -186,11 +186,11 @@ define <4 x i32> @out_constant_varx_42(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @in_constant_varx_42(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: in_constant_varx_42: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -223,11 +223,11 @@ define <4 x i32> @in_constant_varx_42(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @out_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_varx_42_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -261,11 +261,11 @@ define <4 x i32> @out_constant_varx_42_invmask(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @in_constant_varx_42_invmask(ptr%px, 
ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: in_constant_varx_42_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -358,11 +358,11 @@ define <4 x i32> @in_constant_mone_vary(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @out_constant_mone_vary_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] ; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -431,11 +431,11 @@ define <4 x i32> @in_constant_mone_vary_invmask(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @out_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_42_vary: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -468,11 +468,11 @@ define <4 x i32> @out_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @in_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: in_constant_42_vary: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -505,11 +505,11 @@ define <4 x i32> @in_constant_42_vary(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @out_constant_42_vary_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: out_constant_42_vary_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -543,11 +543,11 @@ define <4 x i32> @out_constant_42_vary_invmask(ptr%px, ptr%py, ptr%pmask) { define <4 x i32> @in_constant_42_vary_invmask(ptr%px, ptr%py, ptr%pmask) { ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rdx), %xmm1 ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll 
b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index b1194bedc4e1c..8b6b65ec6ee7b 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -335,17 +335,17 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: xorb %r11b, %bl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: xorb %r10b, %bpl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorb %r10b, %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: xorb %r10b, %r11b +; CHECK-BASELINE-NEXT: xorb %r10b, %bpl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: xorb %dil, %r10b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: xorb %dil, %r10b ; CHECK-BASELINE-NEXT: movb %r10b, 7(%rax) -; CHECK-BASELINE-NEXT: movb %r11b, 6(%rax) +; CHECK-BASELINE-NEXT: movb %bpl, 6(%rax) ; CHECK-BASELINE-NEXT: movb %bl, 5(%rax) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rax) ; CHECK-BASELINE-NEXT: movb %r8b, 3(%rax) @@ -393,17 +393,17 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: xorb %r11b, %bl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: xorb %r10b, %bpl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorb %r10b, %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: xorb %r10b, %r11b +; CHECK-SSE1-NEXT: xorb %r10b, %bpl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: xorb %dil, %r10b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: xorb %dil, %r10b ; CHECK-SSE1-NEXT: movb %r10b, 7(%rax) -; CHECK-SSE1-NEXT: movb %r11b, 6(%rax) +; CHECK-SSE1-NEXT: movb %bpl, 6(%rax) ; CHECK-SSE1-NEXT: movb %bl, 5(%rax) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rax) ; CHECK-SSE1-NEXT: movb %r8b, 3(%rax) @@ -635,59 +635,60 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movl %edx, %r11d +; CHECK-BASELINE-NEXT: movl %esi, %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: xorb %r10b, %sil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: xorb %r10b, %sil -; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-BASELINE-NEXT: xorb %al, %bl 
+; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: xorb %dl, %r11b -; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b +; CHECK-BASELINE-NEXT: xorb %dl, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: xorb %dl, %cl ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: xorb %bl, %r8b +; CHECK-BASELINE-NEXT: xorb %r12b, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-BASELINE-NEXT: xorb %bl, %r8b +; CHECK-BASELINE-NEXT: xorb %r12b, %r8b ; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: xorb %r14b, %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: xorb %r14b, %r9b +; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: xorb %r12b, %r14b +; CHECK-BASELINE-NEXT: xorb %sil, %r14b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: xorb %r12b, %r14b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: xorb %bpl, %r12b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: xorb %sil, %r14b ; CHECK-BASELINE-NEXT: xorb %bpl, %r12b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-BASELINE-NEXT: xorb %r15b, %r8b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-BASELINE-NEXT: xorb %r15b, %r8b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-BASELINE-NEXT: xorb %r15b, %sil +; CHECK-BASELINE-NEXT: xorb %r13b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-BASELINE-NEXT: xorb %r15b, %sil +; CHECK-BASELINE-NEXT: xorb %r13b, %sil ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; CHECK-BASELINE-NEXT: xorb %r13b, %dl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-BASELINE-NEXT: xorb %r9b, %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-BASELINE-NEXT: xorb %r13b, %dl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-BASELINE-NEXT: xorb %r9b, %dl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %al, %r13b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %r15b @@ -698,40 +699,40 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-BASELINE-NEXT: xorb %al, %bpl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorb %r10b, %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b ; 
CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %r10b, %r13b +; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-BASELINE-NEXT: xorb %r9b, %r10b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: xorb %r8b, %r10b +; CHECK-BASELINE-NEXT: xorb %r9b, %r10b ; CHECK-BASELINE-NEXT: movb %r10b, 15(%rdi) ; CHECK-BASELINE-NEXT: movb %al, 14(%rdi) -; CHECK-BASELINE-NEXT: movb %bl, 13(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 13(%rdi) ; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdi) ; CHECK-BASELINE-NEXT: movb %r15b, 11(%rdi) -; CHECK-BASELINE-NEXT: movb %r13b, 10(%rdi) -; CHECK-BASELINE-NEXT: movb %cl, 9(%rdi) -; CHECK-BASELINE-NEXT: movb %dl, 8(%rdi) -; CHECK-BASELINE-NEXT: movb %sil, 7(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 10(%rdi) +; CHECK-BASELINE-NEXT: movb %dl, 9(%rdi) +; CHECK-BASELINE-NEXT: movb %sil, 8(%rdi) +; CHECK-BASELINE-NEXT: movb %r8b, 7(%rdi) ; CHECK-BASELINE-NEXT: movb %r12b, 6(%rdi) ; CHECK-BASELINE-NEXT: movb %r14b, 5(%rdi) -; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: movb %al, 4(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movb %al, 3(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movb %al, 2(%rdi) ; CHECK-BASELINE-NEXT: movb %r11b, 1(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movb %al, (%rdi) +; CHECK-BASELINE-NEXT: movb %bl, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 @@ -750,59 +751,60 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movl %edx, %r11d +; CHECK-SSE1-NEXT: movl %esi, %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: xorb %r10b, %sil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: xorb %r10b, %sil -; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: xorb %r10b, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: xorb %dl, %r11b -; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: xorb %r10b, %r11b +; 
CHECK-SSE1-NEXT: xorb %dl, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: xorb %dl, %cl ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: xorb %bl, %r8b +; CHECK-SSE1-NEXT: xorb %r12b, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-SSE1-NEXT: xorb %bl, %r8b +; CHECK-SSE1-NEXT: xorb %r12b, %r8b ; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: xorb %r14b, %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: xorb %r14b, %r9b +; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: xorb %r12b, %r14b +; CHECK-SSE1-NEXT: xorb %sil, %r14b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: xorb %r12b, %r14b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: xorb %bpl, %r12b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: xorb %sil, %r14b ; CHECK-SSE1-NEXT: xorb %bpl, %r12b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-SSE1-NEXT: xorb %r15b, %r8b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b +; CHECK-SSE1-NEXT: xorb %r15b, %r8b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-SSE1-NEXT: xorb %r15b, %sil +; CHECK-SSE1-NEXT: xorb %r13b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil -; CHECK-SSE1-NEXT: xorb %r15b, %sil +; CHECK-SSE1-NEXT: xorb %r13b, %sil ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; CHECK-SSE1-NEXT: xorb %r13b, %dl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-SSE1-NEXT: xorb %r9b, %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl -; CHECK-SSE1-NEXT: xorb %r13b, %dl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl +; CHECK-SSE1-NEXT: xorb %r9b, %dl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %al, %r13b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %r15b @@ -813,40 +815,40 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-SSE1-NEXT: xorb %al, %bpl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorb %r10b, %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: xorb %r10b, %r13b +; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d -; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-SSE1-NEXT: 
xorb %r9b, %r10b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: xorb %r8b, %r10b +; CHECK-SSE1-NEXT: xorb %r9b, %r10b ; CHECK-SSE1-NEXT: movb %r10b, 15(%rdi) ; CHECK-SSE1-NEXT: movb %al, 14(%rdi) -; CHECK-SSE1-NEXT: movb %bl, 13(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 13(%rdi) ; CHECK-SSE1-NEXT: movb %bpl, 12(%rdi) ; CHECK-SSE1-NEXT: movb %r15b, 11(%rdi) -; CHECK-SSE1-NEXT: movb %r13b, 10(%rdi) -; CHECK-SSE1-NEXT: movb %cl, 9(%rdi) -; CHECK-SSE1-NEXT: movb %dl, 8(%rdi) -; CHECK-SSE1-NEXT: movb %sil, 7(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 10(%rdi) +; CHECK-SSE1-NEXT: movb %dl, 9(%rdi) +; CHECK-SSE1-NEXT: movb %sil, 8(%rdi) +; CHECK-SSE1-NEXT: movb %r8b, 7(%rdi) ; CHECK-SSE1-NEXT: movb %r12b, 6(%rdi) ; CHECK-SSE1-NEXT: movb %r14b, 5(%rdi) -; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: movb %al, 4(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movb %al, 3(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movb %al, 2(%rdi) ; CHECK-SSE1-NEXT: movb %r11b, 1(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movb %al, (%rdi) +; CHECK-SSE1-NEXT: movb %bl, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 @@ -1013,14 +1015,13 @@ define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i32: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 12(%rdx), %edi ; CHECK-BASELINE-NEXT: movl 8(%rdx), %r8d ; CHECK-BASELINE-NEXT: movl (%rdx), %r9d ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: movl (%rsi), %edi +; CHECK-BASELINE-NEXT: xorl %r9d, %edi +; CHECK-BASELINE-NEXT: andl (%rcx), %edi +; CHECK-BASELINE-NEXT: xorl %r9d, %edi ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d ; CHECK-BASELINE-NEXT: xorl %r10d, %r9d ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d @@ -1028,24 +1029,25 @@ define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movl 8(%rsi), %r10d ; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: andl 8(%rcx), %r10d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %edx ; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %edi, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %esi ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi -; CHECK-BASELINE-NEXT: xorl %edi, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %esi ; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) ; CHECK-BASELINE-NEXT: movl %r10d, 8(%rax) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %edx, (%rax) +; CHECK-BASELINE-NEXT: movl %edi, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -1080,35 +1082,35 @@ define <4 x i32> @out_v4i32_undef(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, 
%rax ; CHECK-BASELINE-NEXT: movl 8(%rsi), %edi -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl (%rdx), %r9d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movl (%rdx), %r8d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r9d ; CHECK-BASELINE-NEXT: andl 8(%rcx), %edi ; CHECK-BASELINE-NEXT: movl (%rsi), %r10d -; CHECK-BASELINE-NEXT: xorl %r9d, %r10d +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: andl (%rcx), %r10d -; CHECK-BASELINE-NEXT: xorl %r9d, %r10d -; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d -; CHECK-BASELINE-NEXT: xorl %edx, %r9d -; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: xorl %edx, %r9d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r8d, %edx -; CHECK-BASELINE-NEXT: andl 12(%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r8d, %edx +; CHECK-BASELINE-NEXT: xorl %r8d, %r10d +; CHECK-BASELINE-NEXT: movl 4(%rsi), %r8d +; CHECK-BASELINE-NEXT: xorl %r9d, %r8d +; CHECK-BASELINE-NEXT: andl 4(%rcx), %r8d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %r8d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %esi +; CHECK-BASELINE-NEXT: xorl %edx, %esi +; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi +; CHECK-BASELINE-NEXT: xorl %edx, %esi ; CHECK-BASELINE-NEXT: movl %edi, 8(%rax) -; CHECK-BASELINE-NEXT: movl %edx, 12(%rax) -; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) +; CHECK-BASELINE-NEXT: movl %esi, 12(%rax) +; CHECK-BASELINE-NEXT: movl %r8d, 4(%rax) ; CHECK-BASELINE-NEXT: movl %r10d, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i32_undef: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -1198,8 +1200,6 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq %rdx, %r8 ; CHECK-BASELINE-NEXT: movq %rsi, %r9 ; CHECK-BASELINE-NEXT: movq %rdi, %r11 -; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax @@ -1210,21 +1210,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r13d ; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx ; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi ; CHECK-BASELINE-NEXT: movzbl (%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx -; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb (%r10), %bl -; 
CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl (%rsi), %esi +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb (%r10), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: andb 1(%r10), %al @@ -1233,55 +1233,55 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzbl 2(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: andb 2(%r10), %al +; CHECK-BASELINE-NEXT: movzbl 3(%r9), %ecx +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: andb 3(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: andb 3(%r10), %al -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 4(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 4(%r10), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %r13b, %al -; CHECK-BASELINE-NEXT: andb 5(%r10), %al +; CHECK-BASELINE-NEXT: andb 4(%r10), %al ; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %r12b, %al -; CHECK-BASELINE-NEXT: andb 6(%r10), %al +; CHECK-BASELINE-NEXT: andb 5(%r10), %al ; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: andb 7(%r10), %al +; CHECK-BASELINE-NEXT: andb 6(%r10), %al +; CHECK-BASELINE-NEXT: movzbl 7(%r9), %ecx +; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: andb 7(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: xorb %r14b, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 8(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: andb 9(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; 
CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: andb 10(%r10), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r9), %eax -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 11(%r10), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movzbl 11(%r9), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: andb 11(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 12(%r9), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %cl, %al @@ -1294,18 +1294,18 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andb 13(%r10), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r9), %eax -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 14(%r10), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%r9), %eax -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 15(%r10), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 14(%r9), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: andb 14(%r10), %cl +; CHECK-BASELINE-NEXT: movzbl 15(%r8), %eax +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 15(%r9), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 15(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl @@ -1465,8 +1465,6 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq %rdx, %r8 ; CHECK-SSE1-NEXT: movq %rsi, %r9 ; CHECK-SSE1-NEXT: movq %rdi, %r11 -; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax @@ -1477,21 +1475,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d -; 
CHECK-SSE1-NEXT: movzbl 5(%rdx), %r13d -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebx +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r14d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r12d +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r13d ; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx ; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi ; CHECK-SSE1-NEXT: movzbl (%r8), %eax ; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx -; CHECK-SSE1-NEXT: movzbl (%r9), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb (%r10), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl (%rsi), %esi +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb (%r10), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: andb 1(%r10), %al @@ -1500,55 +1498,55 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzbl 2(%r9), %eax ; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: andb 2(%r10), %al +; CHECK-SSE1-NEXT: movzbl 3(%r9), %ecx +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: andb 3(%r10), %cl ; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r9), %eax -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: andb 3(%r10), %al -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 4(%r9), %eax -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 4(%r10), %al -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax ; CHECK-SSE1-NEXT: xorb %r13b, %al -; CHECK-SSE1-NEXT: andb 5(%r10), %al +; CHECK-SSE1-NEXT: andb 4(%r10), %al ; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax ; CHECK-SSE1-NEXT: xorb %r12b, %al -; CHECK-SSE1-NEXT: andb 6(%r10), %al +; CHECK-SSE1-NEXT: andb 5(%r10), %al ; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r9), %eax +; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax ; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: andb 7(%r10), %al +; CHECK-SSE1-NEXT: andb 6(%r10), %al +; CHECK-SSE1-NEXT: movzbl 7(%r9), %ecx +; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: andb 7(%r10), %cl ; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: xorb %r14b, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 8(%r10), %al -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: andb 9(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; 
CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: andb 10(%r10), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r9), %eax -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 11(%r10), %al -; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movzbl 11(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: andb 11(%r10), %cl +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 12(%r9), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %cl, %al @@ -1561,18 +1559,18 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andb 13(%r10), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r9), %eax -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 14(%r10), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%r9), %eax -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 15(%r10), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 14(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: andb 14(%r10), %cl +; CHECK-SSE1-NEXT: movzbl 15(%r8), %eax +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 15(%r9), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 15(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl @@ -1729,8 +1727,8 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-NEXT: movaps (%rdi), %xmm3 ; CHECK-SSE2-NEXT: andps %xmm0, %xmm3 ; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: orps %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; @@ -1759,25 +1757,25 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d ; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp ; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx -; CHECK-BASELINE-NEXT: 
movzwl 10(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r12d ; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d ; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d ; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d ; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r15d ; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: movl %r8d, %r13d ; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax -; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %r8d +; CHECK-BASELINE-NEXT: xorw %r15w, %r8w +; CHECK-BASELINE-NEXT: andw 2(%rcx), %r8w +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %r8d, %r15d ; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r9w, %ax ; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax @@ -1791,13 +1789,13 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r11w, %ax ; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %r8d +; CHECK-BASELINE-NEXT: xorw %r12w, %r8w +; CHECK-BASELINE-NEXT: andw 10(%rcx), %r8w ; CHECK-BASELINE-NEXT: xorl %eax, %r11d ; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: xorl %r8d, %r12d +; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %bx, %ax ; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax @@ -1809,11 +1807,12 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r14w, %ax ; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r12d ; CHECK-BASELINE-NEXT: xorl %eax, %r14d ; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax +; CHECK-BASELINE-NEXT: xorw %r12w, %ax ; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r15d +; CHECK-BASELINE-NEXT: xorl %eax, %r12d ; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d ; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r13w, %ax @@ -1850,7 +1849,7 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) ; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) ; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) +; CHECK-BASELINE-NEXT: movw %r12w, 18(%rdi) ; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) ; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) ; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) @@ -1862,7 +1861,7 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 
2(%rdi) +; CHECK-BASELINE-NEXT: movw %r15w, 2(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: movw %ax, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax @@ -1882,25 +1881,25 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d ; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d ; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp ; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r12d ; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d ; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d ; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d ; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d +; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r15d ; CHECK-SSE1-NEXT: movzwl (%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: movl %r8d, %r13d ; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax -; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %r8d +; CHECK-SSE1-NEXT: xorw %r15w, %r8w +; CHECK-SSE1-NEXT: andw 2(%rcx), %r8w +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %r8d, %r15d ; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r9w, %ax ; CHECK-SSE1-NEXT: andw 4(%rcx), %ax @@ -1914,13 +1913,13 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r11w, %ax ; CHECK-SSE1-NEXT: andw 8(%rcx), %ax +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %r8d +; CHECK-SSE1-NEXT: xorw %r12w, %r8w +; CHECK-SSE1-NEXT: andw 10(%rcx), %r8w ; CHECK-SSE1-NEXT: xorl %eax, %r11d ; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: xorl %r8d, %r12d +; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %bx, %ax ; CHECK-SSE1-NEXT: andw 12(%rcx), %ax @@ -1932,11 +1931,12 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r14w, %ax ; CHECK-SSE1-NEXT: andw 16(%rcx), %ax +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r12d ; CHECK-SSE1-NEXT: xorl %eax, %r14d ; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax +; CHECK-SSE1-NEXT: xorw %r12w, %ax ; CHECK-SSE1-NEXT: andw 18(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r15d +; CHECK-SSE1-NEXT: xorl %eax, %r12d ; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d ; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r13w, %ax @@ -1973,7 +1973,7 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) ; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) ; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) -; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) +; CHECK-SSE1-NEXT: movw %r12w, 18(%rdi) ; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) ; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) ; 
CHECK-SSE1-NEXT: movw %bx, 12(%rdi) @@ -1985,7 +1985,7 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) +; CHECK-SSE1-NEXT: movw %r15w, 2(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: movw %ax, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax @@ -2006,8 +2006,8 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-NEXT: movaps (%rdi), %xmm3 ; CHECK-SSE2-NEXT: andps %xmm0, %xmm3 ; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: orps %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; @@ -2034,54 +2034,54 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r14 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi ; CHECK-BASELINE-NEXT: movl 24(%rdx), %r8d ; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r14d -; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx ; CHECK-BASELINE-NEXT: movl (%rdx), %r9d +; CHECK-BASELINE-NEXT: movl (%rsi), %edi +; CHECK-BASELINE-NEXT: xorl %r9d, %edi +; CHECK-BASELINE-NEXT: andl (%rcx), %edi ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r11d -; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx -; CHECK-BASELINE-NEXT: andl (%rcx), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx +; CHECK-BASELINE-NEXT: xorl %r9d, %edi ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d ; CHECK-BASELINE-NEXT: xorl %r11d, %r9d ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d ; CHECK-BASELINE-NEXT: xorl %r11d, %r9d ; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %ebp, %r11d +; CHECK-BASELINE-NEXT: xorl %ebx, %r11d ; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d -; CHECK-BASELINE-NEXT: xorl %ebp, %r11d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorl %ebx, %r11d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebx +; CHECK-BASELINE-NEXT: xorl %ebp, %ebx +; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebx +; CHECK-BASELINE-NEXT: movl 16(%rdx), %r14d +; CHECK-BASELINE-NEXT: xorl %ebp, %ebx +; CHECK-BASELINE-NEXT: movl 16(%rsi), %ebp ; CHECK-BASELINE-NEXT: xorl %r14d, %ebp -; CHECK-BASELINE-NEXT: andl 12(%rcx), %ebp +; CHECK-BASELINE-NEXT: andl 16(%rcx), %ebp ; CHECK-BASELINE-NEXT: xorl %r14d, %ebp -; CHECK-BASELINE-NEXT: movl 16(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorl %ebx, %r14d -; CHECK-BASELINE-NEXT: andl 16(%rcx), %r14d -; CHECK-BASELINE-NEXT: xorl %ebx, %r14d -; CHECK-BASELINE-NEXT: movl 20(%rsi), %ebx -; CHECK-BASELINE-NEXT: xorl %r10d, %ebx -; CHECK-BASELINE-NEXT: andl 20(%rcx), %ebx -; CHECK-BASELINE-NEXT: xorl %r10d, %ebx +; CHECK-BASELINE-NEXT: movl 20(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorl %r10d, %r14d +; CHECK-BASELINE-NEXT: andl 20(%rcx), %r14d +; CHECK-BASELINE-NEXT: xorl %r10d, %r14d ; CHECK-BASELINE-NEXT: movl 24(%rsi), %r10d ; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: andl 24(%rcx), %r10d +; CHECK-BASELINE-NEXT: movl 28(%rdx), %edx ; CHECK-BASELINE-NEXT: xorl %r8d, %r10d ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %edi, %esi +; 
CHECK-BASELINE-NEXT: xorl %edx, %esi ; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi -; CHECK-BASELINE-NEXT: xorl %edi, %esi +; CHECK-BASELINE-NEXT: xorl %edx, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rax) ; CHECK-BASELINE-NEXT: movl %r10d, 24(%rax) -; CHECK-BASELINE-NEXT: movl %ebx, 20(%rax) -; CHECK-BASELINE-NEXT: movl %r14d, 16(%rax) -; CHECK-BASELINE-NEXT: movl %ebp, 12(%rax) +; CHECK-BASELINE-NEXT: movl %r14d, 20(%rax) +; CHECK-BASELINE-NEXT: movl %ebp, 16(%rax) +; CHECK-BASELINE-NEXT: movl %ebx, 12(%rax) ; CHECK-BASELINE-NEXT: movl %r11d, 8(%rax) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rax) -; CHECK-BASELINE-NEXT: movl %edx, (%rax) +; CHECK-BASELINE-NEXT: movl %edi, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r14 ; CHECK-BASELINE-NEXT: popq %rbp @@ -2093,54 +2093,54 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r14 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl 28(%rdx), %edi ; CHECK-SSE1-NEXT: movl 24(%rdx), %r8d ; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d -; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx -; CHECK-SSE1-NEXT: movl 12(%rdx), %r14d -; CHECK-SSE1-NEXT: movl 8(%rdx), %ebp +; CHECK-SSE1-NEXT: movl 12(%rdx), %ebp +; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx ; CHECK-SSE1-NEXT: movl (%rdx), %r9d +; CHECK-SSE1-NEXT: movl (%rsi), %edi +; CHECK-SSE1-NEXT: xorl %r9d, %edi +; CHECK-SSE1-NEXT: andl (%rcx), %edi ; CHECK-SSE1-NEXT: movl 4(%rdx), %r11d -; CHECK-SSE1-NEXT: movl (%rsi), %edx -; CHECK-SSE1-NEXT: xorl %r9d, %edx -; CHECK-SSE1-NEXT: andl (%rcx), %edx -; CHECK-SSE1-NEXT: xorl %r9d, %edx +; CHECK-SSE1-NEXT: xorl %r9d, %edi ; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d ; CHECK-SSE1-NEXT: xorl %r11d, %r9d ; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d ; CHECK-SSE1-NEXT: xorl %r11d, %r9d ; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %ebp, %r11d +; CHECK-SSE1-NEXT: xorl %ebx, %r11d ; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d -; CHECK-SSE1-NEXT: xorl %ebp, %r11d -; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp +; CHECK-SSE1-NEXT: xorl %ebx, %r11d +; CHECK-SSE1-NEXT: movl 12(%rsi), %ebx +; CHECK-SSE1-NEXT: xorl %ebp, %ebx +; CHECK-SSE1-NEXT: andl 12(%rcx), %ebx +; CHECK-SSE1-NEXT: movl 16(%rdx), %r14d +; CHECK-SSE1-NEXT: xorl %ebp, %ebx +; CHECK-SSE1-NEXT: movl 16(%rsi), %ebp ; CHECK-SSE1-NEXT: xorl %r14d, %ebp -; CHECK-SSE1-NEXT: andl 12(%rcx), %ebp +; CHECK-SSE1-NEXT: andl 16(%rcx), %ebp ; CHECK-SSE1-NEXT: xorl %r14d, %ebp -; CHECK-SSE1-NEXT: movl 16(%rsi), %r14d -; CHECK-SSE1-NEXT: xorl %ebx, %r14d -; CHECK-SSE1-NEXT: andl 16(%rcx), %r14d -; CHECK-SSE1-NEXT: xorl %ebx, %r14d -; CHECK-SSE1-NEXT: movl 20(%rsi), %ebx -; CHECK-SSE1-NEXT: xorl %r10d, %ebx -; CHECK-SSE1-NEXT: andl 20(%rcx), %ebx -; CHECK-SSE1-NEXT: xorl %r10d, %ebx +; CHECK-SSE1-NEXT: movl 20(%rsi), %r14d +; CHECK-SSE1-NEXT: xorl %r10d, %r14d +; CHECK-SSE1-NEXT: andl 20(%rcx), %r14d +; CHECK-SSE1-NEXT: xorl %r10d, %r14d ; CHECK-SSE1-NEXT: movl 24(%rsi), %r10d ; CHECK-SSE1-NEXT: xorl %r8d, %r10d ; CHECK-SSE1-NEXT: andl 24(%rcx), %r10d +; CHECK-SSE1-NEXT: movl 28(%rdx), %edx ; CHECK-SSE1-NEXT: xorl %r8d, %r10d ; CHECK-SSE1-NEXT: movl 28(%rsi), %esi -; CHECK-SSE1-NEXT: xorl %edi, %esi +; CHECK-SSE1-NEXT: xorl %edx, %esi ; CHECK-SSE1-NEXT: andl 28(%rcx), %esi -; CHECK-SSE1-NEXT: xorl %edi, %esi +; CHECK-SSE1-NEXT: xorl %edx, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rax) ; CHECK-SSE1-NEXT: movl %r10d, 24(%rax) -; CHECK-SSE1-NEXT: movl %ebx, 20(%rax) -; CHECK-SSE1-NEXT: movl %r14d, 16(%rax) -; CHECK-SSE1-NEXT: movl %ebp, 12(%rax) +; 
CHECK-SSE1-NEXT: movl %r14d, 20(%rax) +; CHECK-SSE1-NEXT: movl %ebp, 16(%rax) +; CHECK-SSE1-NEXT: movl %ebx, 12(%rax) ; CHECK-SSE1-NEXT: movl %r11d, 8(%rax) ; CHECK-SSE1-NEXT: movl %r9d, 4(%rax) -; CHECK-SSE1-NEXT: movl %edx, (%rax) +; CHECK-SSE1-NEXT: movl %edi, (%rax) ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r14 ; CHECK-SSE1-NEXT: popq %rbp @@ -2155,8 +2155,8 @@ define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-NEXT: movaps (%rdi), %xmm3 ; CHECK-SSE2-NEXT: andps %xmm0, %xmm3 ; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: orps %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; @@ -2180,14 +2180,13 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-LABEL: out_v4i64: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdi ; CHECK-BASELINE-NEXT: movq 16(%rdx), %r8 ; CHECK-BASELINE-NEXT: movq (%rdx), %r9 ; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 -; CHECK-BASELINE-NEXT: movq (%rsi), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx -; CHECK-BASELINE-NEXT: andq (%rcx), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx +; CHECK-BASELINE-NEXT: movq (%rsi), %rdi +; CHECK-BASELINE-NEXT: xorq %r9, %rdi +; CHECK-BASELINE-NEXT: andq (%rcx), %rdi +; CHECK-BASELINE-NEXT: xorq %r9, %rdi ; CHECK-BASELINE-NEXT: movq 8(%rsi), %r9 ; CHECK-BASELINE-NEXT: xorq %r10, %r9 ; CHECK-BASELINE-NEXT: andq 8(%rcx), %r9 @@ -2195,28 +2194,28 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq 16(%rsi), %r10 ; CHECK-BASELINE-NEXT: xorq %r8, %r10 ; CHECK-BASELINE-NEXT: andq 16(%rcx), %r10 +; CHECK-BASELINE-NEXT: movq 24(%rdx), %rdx ; CHECK-BASELINE-NEXT: xorq %r8, %r10 ; CHECK-BASELINE-NEXT: movq 24(%rsi), %rsi -; CHECK-BASELINE-NEXT: xorq %rdi, %rsi +; CHECK-BASELINE-NEXT: xorq %rdx, %rsi ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi -; CHECK-BASELINE-NEXT: xorq %rdi, %rsi +; CHECK-BASELINE-NEXT: xorq %rdx, %rsi ; CHECK-BASELINE-NEXT: movq %rsi, 24(%rax) ; CHECK-BASELINE-NEXT: movq %r10, 16(%rax) ; CHECK-BASELINE-NEXT: movq %r9, 8(%rax) -; CHECK-BASELINE-NEXT: movq %rdx, (%rax) +; CHECK-BASELINE-NEXT: movq %rdi, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i64: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movq 24(%rdx), %rdi ; CHECK-SSE1-NEXT: movq 16(%rdx), %r8 ; CHECK-SSE1-NEXT: movq (%rdx), %r9 ; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 -; CHECK-SSE1-NEXT: movq (%rsi), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx -; CHECK-SSE1-NEXT: andq (%rcx), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx +; CHECK-SSE1-NEXT: movq (%rsi), %rdi +; CHECK-SSE1-NEXT: xorq %r9, %rdi +; CHECK-SSE1-NEXT: andq (%rcx), %rdi +; CHECK-SSE1-NEXT: xorq %r9, %rdi ; CHECK-SSE1-NEXT: movq 8(%rsi), %r9 ; CHECK-SSE1-NEXT: xorq %r10, %r9 ; CHECK-SSE1-NEXT: andq 8(%rcx), %r9 @@ -2224,15 +2223,16 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq 16(%rsi), %r10 ; CHECK-SSE1-NEXT: xorq %r8, %r10 ; CHECK-SSE1-NEXT: andq 16(%rcx), %r10 +; CHECK-SSE1-NEXT: movq 24(%rdx), %rdx ; CHECK-SSE1-NEXT: xorq %r8, %r10 ; CHECK-SSE1-NEXT: movq 24(%rsi), %rsi -; CHECK-SSE1-NEXT: xorq %rdi, %rsi +; CHECK-SSE1-NEXT: xorq %rdx, %rsi ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi -; CHECK-SSE1-NEXT: xorq %rdi, %rsi +; CHECK-SSE1-NEXT: xorq %rdx, %rsi ; CHECK-SSE1-NEXT: movq %rsi, 24(%rax) ; CHECK-SSE1-NEXT: movq %r10, 16(%rax) ; 
CHECK-SSE1-NEXT: movq %r9, 8(%rax) -; CHECK-SSE1-NEXT: movq %rdx, (%rax) +; CHECK-SSE1-NEXT: movq %rdi, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i64: @@ -2244,8 +2244,8 @@ define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-NEXT: movaps (%rdi), %xmm3 ; CHECK-SSE2-NEXT: andps %xmm0, %xmm3 ; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 ; CHECK-SSE2-NEXT: orps %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; @@ -2362,13 +2362,13 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorb %r11b, %dl ; CHECK-BASELINE-NEXT: xorb %r10b, %cl ; CHECK-BASELINE-NEXT: xorb %dil, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: xorb %r11b, %dl @@ -2386,13 +2386,13 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorb %r11b, %dl ; CHECK-SSE1-NEXT: xorb %r10b, %cl ; CHECK-SSE1-NEXT: xorb %dil, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: xorb %r9b, %sil ; CHECK-SSE1-NEXT: xorb %r11b, %dl @@ -2502,9 +2502,8 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: xorb %r12b, %dl ; CHECK-BASELINE-NEXT: xorb %r14b, %cl ; CHECK-BASELINE-NEXT: xorb %bpl, %r8b -; CHECK-BASELINE-NEXT: xorb %bl, %r9b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax @@ -2516,13 +2515,14 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %bl, %r15b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: xorb %r11b, %sil ; CHECK-BASELINE-NEXT: xorb %r12b, %dl ; CHECK-BASELINE-NEXT: xorb %r14b, %cl ; CHECK-BASELINE-NEXT: xorb %bpl, %r8b -; CHECK-BASELINE-NEXT: xorb %bl, %r9b -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r9b +; CHECK-BASELINE-NEXT: xorb %bl, %r15b ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, 7(%rdi) @@ -2560,9 +2560,8 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: xorb %r12b, %dl ; CHECK-SSE1-NEXT: xorb %r14b, %cl ; 
CHECK-SSE1-NEXT: xorb %bpl, %r8b -; CHECK-SSE1-NEXT: xorb %bl, %r9b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax @@ -2574,13 +2573,14 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %bl, %r15b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: xorb %r11b, %sil ; CHECK-SSE1-NEXT: xorb %r12b, %dl ; CHECK-SSE1-NEXT: xorb %r14b, %cl ; CHECK-SSE1-NEXT: xorb %bpl, %r8b -; CHECK-SSE1-NEXT: xorb %bl, %r9b -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r9b +; CHECK-SSE1-NEXT: xorb %bl, %r15b ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, 7(%rdi) @@ -2624,13 +2624,13 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorl %r11d, %edx ; CHECK-BASELINE-NEXT: xorl %r10d, %ecx ; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorl %r11d, %edx @@ -2648,13 +2648,13 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorl %r11d, %edx ; CHECK-SSE1-NEXT: xorl %r10d, %ecx ; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorl %r11d, %edx @@ -2755,102 +2755,103 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movq %rdi, %rdx +; CHECK-BASELINE-NEXT: movq %rdi, %rcx +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; 
CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: xorb %dil, %r9b +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: xorb %dil, %r9b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: xorb %r10b, %dil -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-BASELINE-NEXT: xorb %r10b, %dil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: xorb %bl, %dil +; CHECK-BASELINE-NEXT: xorb %r10b, %r11b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: xorb %r11b, %r10b +; CHECK-BASELINE-NEXT: xorb %r13b, %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dil +; CHECK-BASELINE-NEXT: xorb %bl, %dil +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorb %r12b, %bl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-BASELINE-NEXT: xorb %r11b, %r10b -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorb %r13b, %r11b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-BASELINE-NEXT: xorb %r13b, %r11b +; CHECK-BASELINE-NEXT: xorb %r13b, %r10b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-BASELINE-NEXT: xorb %r12b, %r13b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %r12b, %r13b +; CHECK-BASELINE-NEXT: xorb %r15b, %r13b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-BASELINE-NEXT: xorb %r12b, %bl ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: xorb %r15b, %r12b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: xorb %r15b, %r12b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %r14b, %r12b +; CHECK-BASELINE-NEXT: xorb %r15b, %r13b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: xorb %r14b, %r15b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-BASELINE-NEXT: xorb %r14b, %r15b +; CHECK-BASELINE-NEXT: xorb %bpl, %r15b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-BASELINE-NEXT: xorb %r14b, %r12b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: xorb %bpl, %r14b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-BASELINE-NEXT: xorb %bpl, %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-BASELINE-NEXT: xorb %bpl, %r15b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: xorb %bl, %bpl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-BASELINE-NEXT: xorb %bl, %bpl -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-BASELINE-NEXT: xorb %al, %r14b ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: xorb %dl, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-BASELINE-NEXT: xorb %sil, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %sil, %cl -; CHECK-BASELINE-NEXT: movb %cl, 
15(%rdx) -; CHECK-BASELINE-NEXT: movb %al, 14(%rdx) -; CHECK-BASELINE-NEXT: movb %bl, 13(%rdx) -; CHECK-BASELINE-NEXT: movb %bpl, 12(%rdx) -; CHECK-BASELINE-NEXT: movb %r14b, 11(%rdx) -; CHECK-BASELINE-NEXT: movb %r15b, 10(%rdx) -; CHECK-BASELINE-NEXT: movb %r12b, 9(%rdx) -; CHECK-BASELINE-NEXT: movb %r13b, 8(%rdx) -; CHECK-BASELINE-NEXT: movb %r11b, 7(%rdx) -; CHECK-BASELINE-NEXT: movb %r10b, 6(%rdx) -; CHECK-BASELINE-NEXT: movb %dil, 5(%rdx) -; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdx) +; CHECK-BASELINE-NEXT: xorb %sil, %bpl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-BASELINE-NEXT: xorb %sil, %bpl +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-BASELINE-NEXT: xorb %sil, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %sil, %dl +; CHECK-BASELINE-NEXT: movb %dl, 15(%rcx) +; CHECK-BASELINE-NEXT: movb %al, 14(%rcx) +; CHECK-BASELINE-NEXT: movb %bpl, 13(%rcx) +; CHECK-BASELINE-NEXT: movb %r14b, 12(%rcx) +; CHECK-BASELINE-NEXT: movb %r15b, 11(%rcx) +; CHECK-BASELINE-NEXT: movb %r12b, 10(%rcx) +; CHECK-BASELINE-NEXT: movb %r13b, 9(%rcx) +; CHECK-BASELINE-NEXT: movb %bl, 8(%rcx) +; CHECK-BASELINE-NEXT: movb %r10b, 7(%rcx) +; CHECK-BASELINE-NEXT: movb %dil, 6(%rcx) +; CHECK-BASELINE-NEXT: movb %r11b, 5(%rcx) +; CHECK-BASELINE-NEXT: movb %r9b, 4(%rcx) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-BASELINE-NEXT: xorb %al, %r8b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdx) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, 2(%rdx) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: movb %r8b, 3(%rcx) +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movb %dl, 2(%rcx) ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, 1(%rdx) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movl %edx, %esi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, (%rdx) -; CHECK-BASELINE-NEXT: movq %rdx, %rax +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-BASELINE-NEXT: movb %sil, 1(%rcx) +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movb %dl, (%rcx) +; CHECK-BASELINE-NEXT: movq %rcx, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -2870,102 
+2871,103 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movq %rdi, %rdx +; CHECK-SSE1-NEXT: movq %rdi, %rcx +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: xorb %dil, %r9b +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorb %r10b, %r11b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: xorb %dil, %r9b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: xorb %r10b, %dil -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dil -; CHECK-SSE1-NEXT: xorb %r10b, %dil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: xorb %bl, %dil +; CHECK-SSE1-NEXT: xorb %r10b, %r11b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: xorb %r11b, %r10b +; CHECK-SSE1-NEXT: xorb %r13b, %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dil +; CHECK-SSE1-NEXT: xorb %bl, %dil +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorb %r12b, %bl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b -; CHECK-SSE1-NEXT: xorb %r11b, %r10b -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorb %r13b, %r11b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r11b -; CHECK-SSE1-NEXT: xorb %r13b, %r11b +; CHECK-SSE1-NEXT: xorb %r13b, %r10b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; CHECK-SSE1-NEXT: xorb %r12b, %r13b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %r12b, %r13b +; CHECK-SSE1-NEXT: xorb %r15b, %r13b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl +; CHECK-SSE1-NEXT: xorb %r12b, %bl ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: xorb %r15b, %r12b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: xorb %r15b, %r12b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %r14b, %r12b +; CHECK-SSE1-NEXT: xorb %r15b, %r13b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: xorb %r14b, %r15b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b -; CHECK-SSE1-NEXT: xorb %r14b, %r15b +; CHECK-SSE1-NEXT: xorb %bpl, %r15b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r12b +; CHECK-SSE1-NEXT: xorb %r14b, %r12b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: xorb %bpl, %r14b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b -; CHECK-SSE1-NEXT: xorb %bpl, %r14b +; CHECK-SSE1-NEXT: xorb %al, %r14b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r15b +; CHECK-SSE1-NEXT: xorb %bpl, %r15b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: xorb %bl, %bpl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl -; CHECK-SSE1-NEXT: xorb %bl, %bpl -; CHECK-SSE1-NEXT: movzbl 
{{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r14b +; CHECK-SSE1-NEXT: xorb %al, %r14b ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: xorb %dl, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-SSE1-NEXT: xorb %sil, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %sil, %cl -; CHECK-SSE1-NEXT: movb %cl, 15(%rdx) -; CHECK-SSE1-NEXT: movb %al, 14(%rdx) -; CHECK-SSE1-NEXT: movb %bl, 13(%rdx) -; CHECK-SSE1-NEXT: movb %bpl, 12(%rdx) -; CHECK-SSE1-NEXT: movb %r14b, 11(%rdx) -; CHECK-SSE1-NEXT: movb %r15b, 10(%rdx) -; CHECK-SSE1-NEXT: movb %r12b, 9(%rdx) -; CHECK-SSE1-NEXT: movb %r13b, 8(%rdx) -; CHECK-SSE1-NEXT: movb %r11b, 7(%rdx) -; CHECK-SSE1-NEXT: movb %r10b, 6(%rdx) -; CHECK-SSE1-NEXT: movb %dil, 5(%rdx) -; CHECK-SSE1-NEXT: movb %r9b, 4(%rdx) +; CHECK-SSE1-NEXT: xorb %sil, %bpl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bpl +; CHECK-SSE1-NEXT: xorb %sil, %bpl +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-SSE1-NEXT: xorb %sil, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %sil, %dl +; CHECK-SSE1-NEXT: movb %dl, 15(%rcx) +; CHECK-SSE1-NEXT: movb %al, 14(%rcx) +; CHECK-SSE1-NEXT: movb %bpl, 13(%rcx) +; CHECK-SSE1-NEXT: movb %r14b, 12(%rcx) +; CHECK-SSE1-NEXT: movb %r15b, 11(%rcx) +; CHECK-SSE1-NEXT: movb %r12b, 10(%rcx) +; CHECK-SSE1-NEXT: movb %r13b, 9(%rcx) +; CHECK-SSE1-NEXT: movb %bl, 8(%rcx) +; CHECK-SSE1-NEXT: movb %r10b, 7(%rcx) +; CHECK-SSE1-NEXT: movb %dil, 6(%rcx) +; CHECK-SSE1-NEXT: movb %r11b, 5(%rcx) +; CHECK-SSE1-NEXT: movb %r9b, 4(%rcx) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-SSE1-NEXT: xorb %al, %r8b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movb %r8b, 3(%rdx) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, 2(%rdx) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: movb %r8b, 3(%rcx) +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movb %dl, 2(%rcx) ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, 1(%rdx) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movl %edx, %esi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, (%rdx) -; CHECK-SSE1-NEXT: movq %rdx, %rax +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; CHECK-SSE1-NEXT: 
xorb %al, %dl +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl +; CHECK-SSE1-NEXT: movb %sil, 1(%rcx) +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movb %dl, (%rcx) +; CHECK-SSE1-NEXT: movq %rcx, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3004,36 +3006,36 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %ebx, %esi ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-BASELINE-NEXT: xorl %ebx, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: xorl %ebx, %edx ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorl %ebx, %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %ebx, %ecx -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorl %ebx, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %ebx, %r8d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: xorl %r11d, %ecx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorl %r11d, %r8d ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: xorl %r11d, %r8d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorw %r11w, %bx +; CHECK-BASELINE-NEXT: xorw %di, %bx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: xorw %r10w, %r11w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-BASELINE-NEXT: xorl %r10d, %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: xorw %di, %r10w -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: xorl %edi, %r10d -; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) -; CHECK-BASELINE-NEXT: movw %bx, 10(%rax) +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorl %edi, %ebx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: xorw %r10w, %di +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: xorl %r10d, %edi +; CHECK-BASELINE-NEXT: movw %di, 14(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 12(%rax) +; CHECK-BASELINE-NEXT: movw %r11w, 10(%rax) ; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) @@ -3054,36 +3056,36 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %ebx, %esi ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx ; CHECK-SSE1-NEXT: xorl %ebx, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: xorl %ebx, %edx ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorl %ebx, %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %ebx, %ecx -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorl %ebx, %r8d -; 
CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %ebx, %r8d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorl %r11d, %ecx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorl %r11d, %r8d ; CHECK-SSE1-NEXT: xorl %ebx, %r9d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: xorl %r11d, %r8d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorw %r11w, %bx +; CHECK-SSE1-NEXT: xorw %di, %bx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: xorw %r10w, %r11w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-SSE1-NEXT: xorl %r10d, %r11d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: xorw %di, %r10w -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: xorl %edi, %r10d -; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) -; CHECK-SSE1-NEXT: movw %r11w, 12(%rax) -; CHECK-SSE1-NEXT: movw %bx, 10(%rax) +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorl %edi, %ebx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: xorw %r10w, %di +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: xorl %r10d, %edi +; CHECK-SSE1-NEXT: movw %di, 14(%rax) +; CHECK-SSE1-NEXT: movw %bx, 12(%rax) +; CHECK-SSE1-NEXT: movw %r11w, 10(%rax) ; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) @@ -3119,7 +3121,6 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movl (%rdx), %r9d ; CHECK-BASELINE-NEXT: movl 4(%rdx), %r10d ; CHECK-BASELINE-NEXT: movl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r9d, %edx ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r11d ; CHECK-BASELINE-NEXT: xorl %r10d, %r11d ; CHECK-BASELINE-NEXT: movl 8(%rsi), %ebx @@ -3129,6 +3130,7 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andl 12(%rcx), %esi ; CHECK-BASELINE-NEXT: andl 8(%rcx), %ebx ; CHECK-BASELINE-NEXT: andl 4(%rcx), %r11d +; CHECK-BASELINE-NEXT: xorl %r9d, %edx ; CHECK-BASELINE-NEXT: andl (%rcx), %edx ; CHECK-BASELINE-NEXT: xorl %r9d, %edx ; CHECK-BASELINE-NEXT: xorl %r10d, %r11d @@ -3143,11 +3145,11 @@ define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; ; CHECK-SSE1-LABEL: in_v4i32: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 ; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq @@ -3231,12 +3233,10 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r12 -; CHECK-BASELINE-NEXT: movq %rdx, %r15 +; CHECK-BASELINE-NEXT: movq %rcx, %r15 +; CHECK-BASELINE-NEXT: movq %rdx, %rbx ; CHECK-BASELINE-NEXT: movq %rsi, %r14 -; CHECK-BASELINE-NEXT: movq %rdi, %r13 -; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movq %rdi, %r12 ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax ; CHECK-BASELINE-NEXT: movb 
%al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax @@ -3247,241 +3247,243 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax -; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx -; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: andb (%r12), %bl -; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl 3(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%rbx), %r13d +; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %edi +; CHECK-BASELINE-NEXT: movzbl (%r14), %r10d +; CHECK-BASELINE-NEXT: xorb %r13b, %r10b ; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d -; CHECK-BASELINE-NEXT: xorb %dl, %r11b -; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b -; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: andb (%r15), %r10b +; CHECK-BASELINE-NEXT: xorb %r13b, %r10b +; CHECK-BASELINE-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%r14), %r10d +; CHECK-BASELINE-NEXT: andb 1(%r15), %r11b +; CHECK-BASELINE-NEXT: xorb %cl, %r10b +; CHECK-BASELINE-NEXT: xorb %dil, %r11b ; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: andb 2(%r12), %dl -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 3(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 4(%r12), %al -; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 3(%r14), %edi +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: andb 2(%r15), %r10b +; CHECK-BASELINE-NEXT: xorb %cl, %r10b +; CHECK-BASELINE-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 4(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: andb 3(%r15), %dil +; CHECK-BASELINE-NEXT: xorb %al, %dil +; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax -; CHECK-BASELINE-NEXT: 
xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 5(%r12), %al -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 6(%r12), %al +; CHECK-BASELINE-NEXT: andb 4(%r15), %cl +; CHECK-BASELINE-NEXT: xorb %dl, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 6(%r14), %ecx +; CHECK-BASELINE-NEXT: andb 5(%r15), %al +; CHECK-BASELINE-NEXT: xorb %sil, %cl ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 7(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 8(%r12), %al +; CHECK-BASELINE-NEXT: andb 6(%r15), %cl +; CHECK-BASELINE-NEXT: xorb %sil, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 8(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %r8b, %dl +; CHECK-BASELINE-NEXT: andb 7(%r15), %al ; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 9(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 10(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 11(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl 9(%r14), %esi +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: andb 8(%r15), %dl +; CHECK-BASELINE-NEXT: xorb %r8b, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 10(%r14), %edx +; CHECK-BASELINE-NEXT: andb 9(%r15), %sil ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 12(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 11(%r14), %esi +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: andb 10(%r15), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movb %dl, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 12(%r14), %edx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 13(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb 11(%r15), %sil +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 13(%r14), %esi +; CHECK-BASELINE-NEXT: andb 12(%r15), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 14(%r14), %edx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: andb 14(%r15), %dl ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 14(%r12), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: andb 13(%r15), %sil +; CHECK-BASELINE-NEXT: xorb %al, %sil +; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 15(%rbx), %eax +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 15(%r12), %cl +; CHECK-BASELINE-NEXT: andb 15(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 16(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r12), %cl +; CHECK-BASELINE-NEXT: andb 16(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 17(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 17(%r12), %cl +; CHECK-BASELINE-NEXT: andb 17(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 18(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 18(%r12), %cl +; CHECK-BASELINE-NEXT: andb 18(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 19(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 19(%r12), %cl +; CHECK-BASELINE-NEXT: andb 19(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax +; 
CHECK-BASELINE-NEXT: movzbl 20(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 20(%r12), %cl +; CHECK-BASELINE-NEXT: andb 20(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp +; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r14), %r13d +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: andb 21(%r15), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl +; CHECK-BASELINE-NEXT: andb 22(%r15), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 22(%r12), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b +; CHECK-BASELINE-NEXT: andb 23(%r15), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b +; CHECK-BASELINE-NEXT: andb 24(%r15), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b +; CHECK-BASELINE-NEXT: andb 25(%r15), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 26(%r12), %dil +; CHECK-BASELINE-NEXT: andb 26(%r15), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 27(%r12), %sil +; CHECK-BASELINE-NEXT: andb 27(%r15), %sil ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%r12), %dl +; CHECK-BASELINE-NEXT: andb 28(%r15), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%r12), %cl +; CHECK-BASELINE-NEXT: andb 29(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %r10d ; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 30(%r12), %al +; CHECK-BASELINE-NEXT: andb 30(%r15), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d -; 
CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) -; CHECK-BASELINE-NEXT: movb %al, 30(%r13) -; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) -; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) -; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) -; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) -; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13) -; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) -; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) -; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) +; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %r10d +; CHECK-BASELINE-NEXT: movzbl 31(%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %r10b, %bl +; CHECK-BASELINE-NEXT: andb 31(%r15), %bl +; CHECK-BASELINE-NEXT: xorb %r10b, %bl +; CHECK-BASELINE-NEXT: movb %bl, 31(%r12) +; CHECK-BASELINE-NEXT: movb %al, 30(%r12) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r12) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r12) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r12) +; CHECK-BASELINE-NEXT: movb %dil, 26(%r12) +; CHECK-BASELINE-NEXT: movb %r8b, 25(%r12) +; CHECK-BASELINE-NEXT: movb %r9b, 24(%r12) +; CHECK-BASELINE-NEXT: movb %r11b, 23(%r12) +; CHECK-BASELINE-NEXT: movb %bpl, 22(%r12) +; CHECK-BASELINE-NEXT: movb %r13b, 21(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 20(%r13) +; CHECK-BASELINE-NEXT: movb %al, 20(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 19(%r13) +; CHECK-BASELINE-NEXT: movb %al, 19(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 18(%r13) +; CHECK-BASELINE-NEXT: movb %al, 18(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 17(%r13) +; CHECK-BASELINE-NEXT: movb %al, 17(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 16(%r13) +; CHECK-BASELINE-NEXT: movb %al, 16(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 15(%r13) +; CHECK-BASELINE-NEXT: movb %al, 15(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 14(%r13) +; CHECK-BASELINE-NEXT: movb %al, 14(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 13(%r13) +; CHECK-BASELINE-NEXT: movb %al, 13(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 12(%r13) +; CHECK-BASELINE-NEXT: movb %al, 12(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 11(%r13) +; CHECK-BASELINE-NEXT: movb %al, 11(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 10(%r13) +; CHECK-BASELINE-NEXT: movb %al, 10(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 9(%r13) +; CHECK-BASELINE-NEXT: movb %al, 9(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 8(%r13) +; CHECK-BASELINE-NEXT: movb %al, 
8(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 7(%r13) +; CHECK-BASELINE-NEXT: movb %al, 7(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 6(%r13) +; CHECK-BASELINE-NEXT: movb %al, 6(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 5(%r13) +; CHECK-BASELINE-NEXT: movb %al, 5(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 4(%r13) +; CHECK-BASELINE-NEXT: movb %al, 4(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%r13) +; CHECK-BASELINE-NEXT: movb %al, 3(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 2(%r13) +; CHECK-BASELINE-NEXT: movb %al, 2(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 1(%r13) +; CHECK-BASELINE-NEXT: movb %al, 1(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, (%r13) -; CHECK-BASELINE-NEXT: movq %r13, %rax +; CHECK-BASELINE-NEXT: movb %al, (%r12) +; CHECK-BASELINE-NEXT: movq %r12, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3498,12 +3500,10 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r12 -; CHECK-SSE1-NEXT: movq %rdx, %r15 +; CHECK-SSE1-NEXT: movq %rcx, %r15 +; CHECK-SSE1-NEXT: movq %rdx, %rbx ; CHECK-SSE1-NEXT: movq %rsi, %r14 -; CHECK-SSE1-NEXT: movq %rdi, %r13 -; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movq %rdi, %r12 ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax @@ -3514,241 +3514,243 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax -; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx -; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl (%r14), %ebx -; CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: andb (%r12), %bl -; CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r8d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r9d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %esi +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl 3(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 2(%rbx), %ecx +; CHECK-SSE1-NEXT: movzbl (%rbx), 
%r13d +; CHECK-SSE1-NEXT: movzbl 1(%rbx), %edi +; CHECK-SSE1-NEXT: movzbl (%r14), %r10d +; CHECK-SSE1-NEXT: xorb %r13b, %r10b ; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d -; CHECK-SSE1-NEXT: xorb %dl, %r11b -; CHECK-SSE1-NEXT: andb 1(%r12), %r11b -; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: andb (%r15), %r10b +; CHECK-SSE1-NEXT: xorb %r13b, %r10b +; CHECK-SSE1-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%r14), %r10d +; CHECK-SSE1-NEXT: andb 1(%r15), %r11b +; CHECK-SSE1-NEXT: xorb %cl, %r10b +; CHECK-SSE1-NEXT: xorb %dil, %r11b ; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: andb 2(%r12), %dl -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 3(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 4(%r12), %al -; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 3(%r14), %edi +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: andb 2(%r15), %r10b +; CHECK-SSE1-NEXT: xorb %cl, %r10b +; CHECK-SSE1-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 4(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: andb 3(%r15), %dil +; CHECK-SSE1-NEXT: xorb %al, %dil +; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 5(%r12), %al -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 6(%r12), %al +; CHECK-SSE1-NEXT: andb 4(%r15), %cl +; CHECK-SSE1-NEXT: xorb %dl, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 6(%r14), %ecx +; CHECK-SSE1-NEXT: andb 5(%r15), %al +; CHECK-SSE1-NEXT: xorb %sil, %cl ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 7(%r12), %al -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 8(%r12), %al +; CHECK-SSE1-NEXT: andb 6(%r15), %cl +; CHECK-SSE1-NEXT: xorb %sil, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 8(%r14), %edx +; CHECK-SSE1-NEXT: xorb %r8b, %dl +; CHECK-SSE1-NEXT: andb 7(%r15), %al ; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 9(%r12), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; 
CHECK-SSE1-NEXT: andb 10(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 11(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl 9(%r14), %esi +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: andb 8(%r15), %dl +; CHECK-SSE1-NEXT: xorb %r8b, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 10(%r14), %edx +; CHECK-SSE1-NEXT: andb 9(%r15), %sil ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 12(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 11(%r14), %esi +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: andb 10(%r15), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 12(%r14), %edx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 13(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb 11(%r15), %sil +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 13(%r14), %esi +; CHECK-SSE1-NEXT: andb 12(%r15), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 14(%r14), %edx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: andb 14(%r15), %dl ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 14(%r12), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: andb 13(%r15), %sil +; CHECK-SSE1-NEXT: xorb %al, %sil +; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 15(%rbx), %eax +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 15(%r12), %cl +; CHECK-SSE1-NEXT: andb 15(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 16(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx ; 
CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 16(%r12), %cl +; CHECK-SSE1-NEXT: andb 16(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 17(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 17(%r12), %cl +; CHECK-SSE1-NEXT: andb 17(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 18(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 18(%r12), %cl +; CHECK-SSE1-NEXT: andb 18(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 19(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 19(%r12), %cl +; CHECK-SSE1-NEXT: andb 19(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 20(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 20(%r12), %cl +; CHECK-SSE1-NEXT: andb 20(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp +; CHECK-SSE1-NEXT: movzbl 21(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 21(%r14), %r13d +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: andb 21(%r15), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: movzbl 22(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebp ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 21(%r12), %bpl +; CHECK-SSE1-NEXT: andb 22(%r15), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 22(%r12), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 23(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: andb 23(%r12), %r11b +; CHECK-SSE1-NEXT: andb 23(%r15), %r11b ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 24(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: andb 24(%r12), %r9b +; CHECK-SSE1-NEXT: andb 24(%r15), %r9b ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 25(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: andb 25(%r12), %r8b +; CHECK-SSE1-NEXT: andb 25(%r15), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 26(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 26(%r12), %dil +; CHECK-SSE1-NEXT: andb 26(%r15), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 27(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 
27(%r12), %sil +; CHECK-SSE1-NEXT: andb 27(%r15), %sil ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 28(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx ; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 28(%r12), %dl +; CHECK-SSE1-NEXT: andb 28(%r15), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 29(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%r12), %cl +; CHECK-SSE1-NEXT: andb 29(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 30(%rbx), %r10d ; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 30(%r12), %al +; CHECK-SSE1-NEXT: andb 30(%r15), %al ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d -; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: andb 31(%r12), %r14b -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: movb %r14b, 31(%r13) -; CHECK-SSE1-NEXT: movb %al, 30(%r13) -; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %dl, 28(%r13) -; CHECK-SSE1-NEXT: movb %sil, 27(%r13) -; CHECK-SSE1-NEXT: movb %dil, 26(%r13) -; CHECK-SSE1-NEXT: movb %r8b, 25(%r13) -; CHECK-SSE1-NEXT: movb %r9b, 24(%r13) -; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) -; CHECK-SSE1-NEXT: movb %bl, 22(%r13) -; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) +; CHECK-SSE1-NEXT: movzbl 31(%rbx), %r10d +; CHECK-SSE1-NEXT: movzbl 31(%r14), %ebx +; CHECK-SSE1-NEXT: xorb %r10b, %bl +; CHECK-SSE1-NEXT: andb 31(%r15), %bl +; CHECK-SSE1-NEXT: xorb %r10b, %bl +; CHECK-SSE1-NEXT: movb %bl, 31(%r12) +; CHECK-SSE1-NEXT: movb %al, 30(%r12) +; CHECK-SSE1-NEXT: movb %cl, 29(%r12) +; CHECK-SSE1-NEXT: movb %dl, 28(%r12) +; CHECK-SSE1-NEXT: movb %sil, 27(%r12) +; CHECK-SSE1-NEXT: movb %dil, 26(%r12) +; CHECK-SSE1-NEXT: movb %r8b, 25(%r12) +; CHECK-SSE1-NEXT: movb %r9b, 24(%r12) +; CHECK-SSE1-NEXT: movb %r11b, 23(%r12) +; CHECK-SSE1-NEXT: movb %bpl, 22(%r12) +; CHECK-SSE1-NEXT: movb %r13b, 21(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 20(%r13) +; CHECK-SSE1-NEXT: movb %al, 20(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 19(%r13) +; CHECK-SSE1-NEXT: movb %al, 19(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 18(%r13) +; CHECK-SSE1-NEXT: movb %al, 18(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 17(%r13) +; CHECK-SSE1-NEXT: movb %al, 17(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 16(%r13) +; CHECK-SSE1-NEXT: movb %al, 16(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 15(%r13) +; CHECK-SSE1-NEXT: movb %al, 15(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 14(%r13) +; CHECK-SSE1-NEXT: movb %al, 14(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 13(%r13) +; CHECK-SSE1-NEXT: movb %al, 13(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 
12(%r13) +; CHECK-SSE1-NEXT: movb %al, 12(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 11(%r13) +; CHECK-SSE1-NEXT: movb %al, 11(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 10(%r13) +; CHECK-SSE1-NEXT: movb %al, 10(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 9(%r13) +; CHECK-SSE1-NEXT: movb %al, 9(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 8(%r13) +; CHECK-SSE1-NEXT: movb %al, 8(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 7(%r13) +; CHECK-SSE1-NEXT: movb %al, 7(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 6(%r13) +; CHECK-SSE1-NEXT: movb %al, 6(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 5(%r13) +; CHECK-SSE1-NEXT: movb %al, 5(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 4(%r13) +; CHECK-SSE1-NEXT: movb %al, 4(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 3(%r13) +; CHECK-SSE1-NEXT: movb %al, 3(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 2(%r13) +; CHECK-SSE1-NEXT: movb %al, 2(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 1(%r13) +; CHECK-SSE1-NEXT: movb %al, 1(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, (%r13) -; CHECK-SSE1-NEXT: movq %r13, %rax +; CHECK-SSE1-NEXT: movb %al, (%r12) +; CHECK-SSE1-NEXT: movq %r12, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3760,15 +3762,15 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-LABEL: in_v32i8: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm3 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm3 ; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: orps %xmm3, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v32i8: @@ -3795,20 +3797,20 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r9 -; CHECK-BASELINE-NEXT: movq %rdi, %r10 +; CHECK-BASELINE-NEXT: movq %rcx, %r8 +; CHECK-BASELINE-NEXT: movq %rdi, %r9 ; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edi ; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 28(%rdx), %edi ; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %edi -; CHECK-BASELINE-NEXT: movl %edi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 24(%rdx), %eax +; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %eax +; CHECK-BASELINE-NEXT: movl 24(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 20(%rdx), %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d +; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r11d ; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 16(%rdx), %ebx @@ -3823,112 +3825,116 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r13d ; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl (%rdx), %ecx +; CHECK-BASELINE-NEXT: movl (%rdx), %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 4(%rdx), %ecx ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 4(%rdx), %edi -; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl (%rsi), %edx -; CHECK-BASELINE-NEXT: xorw %cx, %dx +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %edx +; CHECK-BASELINE-NEXT: xorw %ax, %dx ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %ecx -; CHECK-BASELINE-NEXT: xorw %ax, %cx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %di, %ax +; CHECK-BASELINE-NEXT: xorw %cx, %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %ecx -; CHECK-BASELINE-NEXT: xorw %r13w, %cx +; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %edx +; CHECK-BASELINE-NEXT: xorw %r13w, %dx +; CHECK-BASELINE-NEXT: movl %edx, %ecx ; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r12w, %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %r15w, %ax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %edx ; CHECK-BASELINE-NEXT: xorw %r14w, %dx -; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %r13d -; CHECK-BASELINE-NEXT: xorw %bp, %r13w -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorw %bx, %r12w -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r15d -; CHECK-BASELINE-NEXT: xorw %r11w, %r15w -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorw %r8w, %r14w -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %ebp -; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload +; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %bp, %ax +; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %r13d +; CHECK-BASELINE-NEXT: xorw %bx, %r13w +; 
CHECK-BASELINE-NEXT: movzwl 18(%rsi), %r12d +; CHECK-BASELINE-NEXT: xorw %r11w, %r12w +; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %r15d +; CHECK-BASELINE-NEXT: xorw %r10w, %r15w +; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %r14d +; CHECK-BASELINE-NEXT: xorw %di, %r14w ; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %ebx ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload ; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r11d ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %edi +; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload +; CHECK-BASELINE-NEXT: movzwl (%rsi), %ebp +; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %edi ; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi -; CHECK-BASELINE-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload -; CHECK-BASELINE-NEXT: andw 30(%r9), %si -; CHECK-BASELINE-NEXT: andw 28(%r9), %di -; CHECK-BASELINE-NEXT: andw 26(%r9), %r11w -; CHECK-BASELINE-NEXT: andw 24(%r9), %bx -; CHECK-BASELINE-NEXT: andw 22(%r9), %bp -; CHECK-BASELINE-NEXT: andw 20(%r9), %r14w -; CHECK-BASELINE-NEXT: andw 18(%r9), %r15w -; CHECK-BASELINE-NEXT: andw 16(%r9), %r12w -; CHECK-BASELINE-NEXT: andw 14(%r9), %r13w -; CHECK-BASELINE-NEXT: andw 12(%r9), %dx +; CHECK-BASELINE-NEXT: andw 30(%r8), %di +; CHECK-BASELINE-NEXT: andw 28(%r8), %r10w +; CHECK-BASELINE-NEXT: andw 26(%r8), %r11w +; CHECK-BASELINE-NEXT: andw 24(%r8), %bx +; CHECK-BASELINE-NEXT: andw 22(%r8), %r14w +; CHECK-BASELINE-NEXT: andw 20(%r8), %r15w +; CHECK-BASELINE-NEXT: andw 18(%r8), %r12w +; CHECK-BASELINE-NEXT: andw 16(%r8), %r13w +; CHECK-BASELINE-NEXT: andw 14(%r8), %ax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: andw 12(%r8), %dx ; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: andw 10(%r9), %ax +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-BASELINE-NEXT: andw 10(%r8), %ax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 8(%r9), %dx -; CHECK-BASELINE-NEXT: andw 6(%r9), %cx +; CHECK-BASELINE-NEXT: andw 8(%r8), %dx +; CHECK-BASELINE-NEXT: andw 6(%r8), %cx ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 4(%r9), %r8w ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: andw 2(%r9), %ax +; CHECK-BASELINE-NEXT: andw 4(%r8), %ax ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-BASELINE-NEXT: andw (%r9), %cx +; CHECK-BASELINE-NEXT: andw 2(%r8), %cx +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-BASELINE-NEXT: xorw %si, %bp +; CHECK-BASELINE-NEXT: andw (%r8), %bp +; CHECK-BASELINE-NEXT: xorl %esi, %ebp ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 
4-byte Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl %edx, %ecx ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: movw %si, 30(%r10) -; CHECK-BASELINE-NEXT: movw %di, 28(%r10) -; CHECK-BASELINE-NEXT: movw %r11w, 26(%r10) -; CHECK-BASELINE-NEXT: movw %bx, 24(%r10) -; CHECK-BASELINE-NEXT: movw %bp, 22(%r10) -; CHECK-BASELINE-NEXT: movw %r14w, 20(%r10) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%r10) -; CHECK-BASELINE-NEXT: movw %r12w, 16(%r10) -; CHECK-BASELINE-NEXT: movw %r13w, 14(%r10) -; CHECK-BASELINE-NEXT: movw %ax, 12(%r10) -; CHECK-BASELINE-NEXT: movw %dx, 10(%r10) -; CHECK-BASELINE-NEXT: movw %cx, 8(%r10) -; CHECK-BASELINE-NEXT: movw %r9w, 6(%r10) -; CHECK-BASELINE-NEXT: movw %r8w, 4(%r10) +; CHECK-BASELINE-NEXT: movw %di, 30(%r9) +; CHECK-BASELINE-NEXT: movw %r10w, 28(%r9) +; CHECK-BASELINE-NEXT: movw %r11w, 26(%r9) +; CHECK-BASELINE-NEXT: movw %bx, 24(%r9) +; CHECK-BASELINE-NEXT: movw %r14w, 22(%r9) +; CHECK-BASELINE-NEXT: movw %r15w, 20(%r9) +; CHECK-BASELINE-NEXT: movw %r12w, 18(%r9) +; CHECK-BASELINE-NEXT: movw %r13w, 16(%r9) +; CHECK-BASELINE-NEXT: movw %si, 14(%r9) +; CHECK-BASELINE-NEXT: movw %ax, 12(%r9) +; CHECK-BASELINE-NEXT: movw %dx, 10(%r9) +; CHECK-BASELINE-NEXT: movw %cx, 8(%r9) +; CHECK-BASELINE-NEXT: movw %r8w, 6(%r9) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 2(%r10) +; CHECK-BASELINE-NEXT: movw %ax, 4(%r9) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%r10) -; CHECK-BASELINE-NEXT: movq %r10, %rax +; CHECK-BASELINE-NEXT: movw %ax, 2(%r9) +; CHECK-BASELINE-NEXT: movw %bp, (%r9) +; CHECK-BASELINE-NEXT: movq %r9, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3945,20 +3951,20 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq 
%rbx -; CHECK-SSE1-NEXT: movq %rcx, %r9 -; CHECK-SSE1-NEXT: movq %rdi, %r10 +; CHECK-SSE1-NEXT: movq %rcx, %r8 +; CHECK-SSE1-NEXT: movq %rdi, %r9 ; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edi ; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 28(%rdx), %edi ; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 24(%rdx), %eax +; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 22(%rdx), %eax +; CHECK-SSE1-NEXT: movl 24(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 20(%rdx), %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 22(%rdx), %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d +; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r11d ; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 16(%rdx), %ebx @@ -3973,112 +3979,116 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r13d ; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl (%rdx), %ecx +; CHECK-SSE1-NEXT: movl (%rdx), %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 4(%rdx), %ecx ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 4(%rdx), %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 2(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl (%rsi), %edx -; CHECK-SSE1-NEXT: xorw %cx, %dx +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %edx +; CHECK-SSE1-NEXT: xorw %ax, %dx ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %ecx -; CHECK-SSE1-NEXT: xorw %ax, %cx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %di, %ax +; CHECK-SSE1-NEXT: xorw %cx, %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %ecx -; CHECK-SSE1-NEXT: xorw %r13w, %cx +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %edx +; CHECK-SSE1-NEXT: xorw %r13w, %dx +; CHECK-SSE1-NEXT: movl %edx, %ecx ; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r12w, %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %r15w, %ax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movzwl 12(%rsi), %edx ; CHECK-SSE1-NEXT: xorw %r14w, %dx -; CHECK-SSE1-NEXT: movzwl 14(%rsi), %r13d -; CHECK-SSE1-NEXT: xorw %bp, %r13w -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r12d -; CHECK-SSE1-NEXT: xorw %bx, %r12w -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r15d -; CHECK-SSE1-NEXT: xorw %r11w, %r15w -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %r14d -; CHECK-SSE1-NEXT: xorw %r8w, %r14w -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %ebp -; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte 
Folded Reload +; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %bp, %ax +; CHECK-SSE1-NEXT: movzwl 16(%rsi), %r13d +; CHECK-SSE1-NEXT: xorw %bx, %r13w +; CHECK-SSE1-NEXT: movzwl 18(%rsi), %r12d +; CHECK-SSE1-NEXT: xorw %r11w, %r12w +; CHECK-SSE1-NEXT: movzwl 20(%rsi), %r15d +; CHECK-SSE1-NEXT: xorw %r10w, %r15w +; CHECK-SSE1-NEXT: movzwl 22(%rsi), %r14d +; CHECK-SSE1-NEXT: xorw %di, %r14w ; CHECK-SSE1-NEXT: movzwl 24(%rsi), %ebx ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload ; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r11d ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %edi +; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r10d +; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload +; CHECK-SSE1-NEXT: movzwl (%rsi), %ebp +; CHECK-SSE1-NEXT: movzwl 30(%rsi), %edi ; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload -; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi -; CHECK-SSE1-NEXT: xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload -; CHECK-SSE1-NEXT: andw 30(%r9), %si -; CHECK-SSE1-NEXT: andw 28(%r9), %di -; CHECK-SSE1-NEXT: andw 26(%r9), %r11w -; CHECK-SSE1-NEXT: andw 24(%r9), %bx -; CHECK-SSE1-NEXT: andw 22(%r9), %bp -; CHECK-SSE1-NEXT: andw 20(%r9), %r14w -; CHECK-SSE1-NEXT: andw 18(%r9), %r15w -; CHECK-SSE1-NEXT: andw 16(%r9), %r12w -; CHECK-SSE1-NEXT: andw 14(%r9), %r13w -; CHECK-SSE1-NEXT: andw 12(%r9), %dx +; CHECK-SSE1-NEXT: andw 30(%r8), %di +; CHECK-SSE1-NEXT: andw 28(%r8), %r10w +; CHECK-SSE1-NEXT: andw 26(%r8), %r11w +; CHECK-SSE1-NEXT: andw 24(%r8), %bx +; CHECK-SSE1-NEXT: andw 22(%r8), %r14w +; CHECK-SSE1-NEXT: andw 20(%r8), %r15w +; CHECK-SSE1-NEXT: andw 18(%r8), %r12w +; CHECK-SSE1-NEXT: andw 16(%r8), %r13w +; CHECK-SSE1-NEXT: andw 14(%r8), %ax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: andw 12(%r8), %dx ; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: andw 10(%r9), %ax +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; CHECK-SSE1-NEXT: andw 10(%r8), %ax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload -; CHECK-SSE1-NEXT: andw 8(%r9), %dx -; CHECK-SSE1-NEXT: andw 6(%r9), %cx +; CHECK-SSE1-NEXT: andw 8(%r8), %dx +; CHECK-SSE1-NEXT: andw 6(%r8), %cx ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload -; CHECK-SSE1-NEXT: andw 4(%r9), %r8w ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: andw 2(%r9), %ax +; CHECK-SSE1-NEXT: andw 4(%r8), %ax ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-SSE1-NEXT: andw (%r9), %cx +; CHECK-SSE1-NEXT: andw 2(%r8), %cx +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-SSE1-NEXT: xorw %si, %bp +; CHECK-SSE1-NEXT: andw (%r8), %bp +; CHECK-SSE1-NEXT: xorl %esi, %ebp ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload -; 
CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl %edx, %ecx ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-SSE1-NEXT: movw %si, 30(%r10) -; CHECK-SSE1-NEXT: movw %di, 28(%r10) -; CHECK-SSE1-NEXT: movw %r11w, 26(%r10) -; CHECK-SSE1-NEXT: movw %bx, 24(%r10) -; CHECK-SSE1-NEXT: movw %bp, 22(%r10) -; CHECK-SSE1-NEXT: movw %r14w, 20(%r10) -; CHECK-SSE1-NEXT: movw %r15w, 18(%r10) -; CHECK-SSE1-NEXT: movw %r12w, 16(%r10) -; CHECK-SSE1-NEXT: movw %r13w, 14(%r10) -; CHECK-SSE1-NEXT: movw %ax, 12(%r10) -; CHECK-SSE1-NEXT: movw %dx, 10(%r10) -; CHECK-SSE1-NEXT: movw %cx, 8(%r10) -; CHECK-SSE1-NEXT: movw %r9w, 6(%r10) -; CHECK-SSE1-NEXT: movw %r8w, 4(%r10) +; CHECK-SSE1-NEXT: movw %di, 30(%r9) +; CHECK-SSE1-NEXT: movw %r10w, 28(%r9) +; CHECK-SSE1-NEXT: movw %r11w, 26(%r9) +; CHECK-SSE1-NEXT: movw %bx, 24(%r9) +; CHECK-SSE1-NEXT: movw %r14w, 22(%r9) +; CHECK-SSE1-NEXT: movw %r15w, 20(%r9) +; CHECK-SSE1-NEXT: movw %r12w, 18(%r9) +; CHECK-SSE1-NEXT: movw %r13w, 16(%r9) +; CHECK-SSE1-NEXT: movw %si, 14(%r9) +; CHECK-SSE1-NEXT: movw %ax, 12(%r9) +; CHECK-SSE1-NEXT: movw %dx, 10(%r9) +; CHECK-SSE1-NEXT: movw %cx, 8(%r9) +; CHECK-SSE1-NEXT: movw %r8w, 6(%r9) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 2(%r10) +; CHECK-SSE1-NEXT: movw %ax, 4(%r9) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%r10) -; CHECK-SSE1-NEXT: movq %r10, %rax +; CHECK-SSE1-NEXT: movw %ax, 2(%r9) +; CHECK-SSE1-NEXT: movw %bp, (%r9) +; CHECK-SSE1-NEXT: movq %r9, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -4090,15 +4100,15 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-LABEL: in_v16i16: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm3 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm3 ; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: orps %xmm3, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v16i16: @@ -4125,57 +4135,58 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movl 28(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl 24(%rdx), %ebx -; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d +; CHECK-BASELINE-NEXT: movq %rcx, %r8 +; CHECK-BASELINE-NEXT: movl 28(%rdx), %ebx +; CHECK-BASELINE-NEXT: movl 24(%rdx), %r11d +; CHECK-BASELINE-NEXT: movl 20(%rdx), %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 16(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d -; CHECK-BASELINE-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r15d +; CHECK-BASELINE-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 8(%rdx), %r14d -; CHECK-BASELINE-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movl (%rdx), %r15d -; CHECK-BASELINE-NEXT: movl 4(%rdx), %r13d -; CHECK-BASELINE-NEXT: movl (%rsi), %r8d -; CHECK-BASELINE-NEXT: xorl %r15d, %r8d +; CHECK-BASELINE-NEXT: movl (%rdx), %r13d +; CHECK-BASELINE-NEXT: movl 4(%rdx), %r12d ; CHECK-BASELINE-NEXT: movl 4(%rsi), %r9d -; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: movl 8(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorl %r14d, %r11d -; CHECK-BASELINE-NEXT: movl 12(%rsi), %r14d -; CHECK-BASELINE-NEXT: xorl %r12d, %r14d -; CHECK-BASELINE-NEXT: movl 16(%rsi), %r12d -; CHECK-BASELINE-NEXT: xorl %eax, %r12d +; CHECK-BASELINE-NEXT: xorl %r12d, %r9d +; CHECK-BASELINE-NEXT: movl 8(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorl %r14d, %r10d +; CHECK-BASELINE-NEXT: movl 12(%rsi), %ebp +; CHECK-BASELINE-NEXT: xorl %r15d, %ebp +; CHECK-BASELINE-NEXT: movl 16(%rsi), %r15d +; CHECK-BASELINE-NEXT: xorl %eax, %r15d ; CHECK-BASELINE-NEXT: movl 20(%rsi), %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx +; CHECK-BASELINE-NEXT: xorl %ecx, %edx ; CHECK-BASELINE-NEXT: movl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorl %ebx, %eax +; CHECK-BASELINE-NEXT: xorl %r11d, %eax +; CHECK-BASELINE-NEXT: movl (%rsi), %ecx ; CHECK-BASELINE-NEXT: movl 28(%rsi), %esi -; CHECK-BASELINE-NEXT: xorl %ebp, %esi -; CHECK-BASELINE-NEXT: andl 28(%rcx), %esi -; CHECK-BASELINE-NEXT: andl 24(%rcx), %eax -; CHECK-BASELINE-NEXT: andl 20(%rcx), %edx -; CHECK-BASELINE-NEXT: andl 16(%rcx), %r12d -; CHECK-BASELINE-NEXT: andl 12(%rcx), %r14d -; CHECK-BASELINE-NEXT: andl 8(%rcx), %r11d -; CHECK-BASELINE-NEXT: andl 4(%rcx), %r9d -; CHECK-BASELINE-NEXT: andl (%rcx), %r8d -; CHECK-BASELINE-NEXT: xorl %r15d, %r8d -; CHECK-BASELINE-NEXT: xorl %r13d, %r9d -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: xorl %ebx, %eax -; CHECK-BASELINE-NEXT: xorl %ebp, %esi +; CHECK-BASELINE-NEXT: xorl %ebx, %esi +; CHECK-BASELINE-NEXT: andl 28(%r8), %esi +; CHECK-BASELINE-NEXT: andl 24(%r8), %eax +; 
CHECK-BASELINE-NEXT: andl 20(%r8), %edx +; CHECK-BASELINE-NEXT: andl 16(%r8), %r15d +; CHECK-BASELINE-NEXT: andl 12(%r8), %ebp +; CHECK-BASELINE-NEXT: andl 8(%r8), %r10d +; CHECK-BASELINE-NEXT: andl 4(%r8), %r9d +; CHECK-BASELINE-NEXT: xorl %r13d, %ecx +; CHECK-BASELINE-NEXT: andl (%r8), %ecx +; CHECK-BASELINE-NEXT: xorl %r13d, %ecx +; CHECK-BASELINE-NEXT: xorl %r12d, %r9d +; CHECK-BASELINE-NEXT: xorl %r14d, %r10d +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl %r11d, %eax +; CHECK-BASELINE-NEXT: xorl %ebx, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rdi) ; CHECK-BASELINE-NEXT: movl %eax, 24(%rdi) ; CHECK-BASELINE-NEXT: movl %edx, 20(%rdi) -; CHECK-BASELINE-NEXT: movl %r12d, 16(%rdi) -; CHECK-BASELINE-NEXT: movl %r14d, 12(%rdi) -; CHECK-BASELINE-NEXT: movl %r11d, 8(%rdi) +; CHECK-BASELINE-NEXT: movl %r15d, 16(%rdi) +; CHECK-BASELINE-NEXT: movl %ebp, 12(%rdi) +; CHECK-BASELINE-NEXT: movl %r10d, 8(%rdi) ; CHECK-BASELINE-NEXT: movl %r9d, 4(%rdi) -; CHECK-BASELINE-NEXT: movl %r8d, (%rdi) +; CHECK-BASELINE-NEXT: movl %ecx, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 @@ -4193,57 +4204,58 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movl 28(%rdx), %ebp -; CHECK-SSE1-NEXT: movl 24(%rdx), %ebx -; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d +; CHECK-SSE1-NEXT: movq %rcx, %r8 +; CHECK-SSE1-NEXT: movl 28(%rdx), %ebx +; CHECK-SSE1-NEXT: movl 24(%rdx), %r11d +; CHECK-SSE1-NEXT: movl 20(%rdx), %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 16(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d -; CHECK-SSE1-NEXT: movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movl 12(%rdx), %r15d +; CHECK-SSE1-NEXT: movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 8(%rdx), %r14d -; CHECK-SSE1-NEXT: movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movl (%rdx), %r15d -; CHECK-SSE1-NEXT: movl 4(%rdx), %r13d -; CHECK-SSE1-NEXT: movl (%rsi), %r8d -; CHECK-SSE1-NEXT: xorl %r15d, %r8d +; CHECK-SSE1-NEXT: movl (%rdx), %r13d +; CHECK-SSE1-NEXT: movl 4(%rdx), %r12d ; CHECK-SSE1-NEXT: movl 4(%rsi), %r9d -; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: movl 8(%rsi), %r11d -; CHECK-SSE1-NEXT: xorl %r14d, %r11d -; CHECK-SSE1-NEXT: movl 12(%rsi), %r14d -; CHECK-SSE1-NEXT: xorl %r12d, %r14d -; CHECK-SSE1-NEXT: movl 16(%rsi), %r12d -; CHECK-SSE1-NEXT: xorl %eax, %r12d +; CHECK-SSE1-NEXT: xorl %r12d, %r9d +; CHECK-SSE1-NEXT: movl 8(%rsi), %r10d +; CHECK-SSE1-NEXT: xorl %r14d, %r10d +; CHECK-SSE1-NEXT: movl 12(%rsi), %ebp +; CHECK-SSE1-NEXT: xorl %r15d, %ebp +; CHECK-SSE1-NEXT: movl 16(%rsi), %r15d +; CHECK-SSE1-NEXT: xorl %eax, %r15d ; CHECK-SSE1-NEXT: movl 20(%rsi), %edx -; CHECK-SSE1-NEXT: xorl %r10d, %edx +; CHECK-SSE1-NEXT: xorl %ecx, %edx ; CHECK-SSE1-NEXT: movl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorl %ebx, %eax +; CHECK-SSE1-NEXT: xorl %r11d, %eax +; CHECK-SSE1-NEXT: movl (%rsi), %ecx ; CHECK-SSE1-NEXT: movl 28(%rsi), %esi -; CHECK-SSE1-NEXT: xorl %ebp, %esi -; CHECK-SSE1-NEXT: andl 28(%rcx), %esi -; 
CHECK-SSE1-NEXT: andl 24(%rcx), %eax -; CHECK-SSE1-NEXT: andl 20(%rcx), %edx -; CHECK-SSE1-NEXT: andl 16(%rcx), %r12d -; CHECK-SSE1-NEXT: andl 12(%rcx), %r14d -; CHECK-SSE1-NEXT: andl 8(%rcx), %r11d -; CHECK-SSE1-NEXT: andl 4(%rcx), %r9d -; CHECK-SSE1-NEXT: andl (%rcx), %r8d -; CHECK-SSE1-NEXT: xorl %r15d, %r8d -; CHECK-SSE1-NEXT: xorl %r13d, %r9d -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: xorl %ebx, %eax -; CHECK-SSE1-NEXT: xorl %ebp, %esi +; CHECK-SSE1-NEXT: xorl %ebx, %esi +; CHECK-SSE1-NEXT: andl 28(%r8), %esi +; CHECK-SSE1-NEXT: andl 24(%r8), %eax +; CHECK-SSE1-NEXT: andl 20(%r8), %edx +; CHECK-SSE1-NEXT: andl 16(%r8), %r15d +; CHECK-SSE1-NEXT: andl 12(%r8), %ebp +; CHECK-SSE1-NEXT: andl 8(%r8), %r10d +; CHECK-SSE1-NEXT: andl 4(%r8), %r9d +; CHECK-SSE1-NEXT: xorl %r13d, %ecx +; CHECK-SSE1-NEXT: andl (%r8), %ecx +; CHECK-SSE1-NEXT: xorl %r13d, %ecx +; CHECK-SSE1-NEXT: xorl %r12d, %r9d +; CHECK-SSE1-NEXT: xorl %r14d, %r10d +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl %r11d, %eax +; CHECK-SSE1-NEXT: xorl %ebx, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rdi) ; CHECK-SSE1-NEXT: movl %eax, 24(%rdi) ; CHECK-SSE1-NEXT: movl %edx, 20(%rdi) -; CHECK-SSE1-NEXT: movl %r12d, 16(%rdi) -; CHECK-SSE1-NEXT: movl %r14d, 12(%rdi) -; CHECK-SSE1-NEXT: movl %r11d, 8(%rdi) +; CHECK-SSE1-NEXT: movl %r15d, 16(%rdi) +; CHECK-SSE1-NEXT: movl %ebp, 12(%rdi) +; CHECK-SSE1-NEXT: movl %r10d, 8(%rdi) ; CHECK-SSE1-NEXT: movl %r9d, 4(%rdi) -; CHECK-SSE1-NEXT: movl %r8d, (%rdi) +; CHECK-SSE1-NEXT: movl %ecx, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 @@ -4256,15 +4268,15 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-LABEL: in_v8i32: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm3 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm3 ; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: orps %xmm3, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v8i32: @@ -4292,7 +4304,6 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq (%rdx), %r9 ; CHECK-BASELINE-NEXT: movq 8(%rdx), %r10 ; CHECK-BASELINE-NEXT: movq (%rsi), %rdx -; CHECK-BASELINE-NEXT: xorq %r9, %rdx ; CHECK-BASELINE-NEXT: movq 8(%rsi), %r11 ; CHECK-BASELINE-NEXT: xorq %r10, %r11 ; CHECK-BASELINE-NEXT: movq 16(%rsi), %rbx @@ -4302,6 +4313,7 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andq 24(%rcx), %rsi ; CHECK-BASELINE-NEXT: andq 16(%rcx), %rbx ; CHECK-BASELINE-NEXT: andq 8(%rcx), %r11 +; CHECK-BASELINE-NEXT: xorq %r9, %rdx ; CHECK-BASELINE-NEXT: andq (%rcx), %rdx ; 
CHECK-BASELINE-NEXT: xorq %r9, %rdx ; CHECK-BASELINE-NEXT: xorq %r10, %r11 @@ -4323,7 +4335,6 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq (%rdx), %r9 ; CHECK-SSE1-NEXT: movq 8(%rdx), %r10 ; CHECK-SSE1-NEXT: movq (%rsi), %rdx -; CHECK-SSE1-NEXT: xorq %r9, %rdx ; CHECK-SSE1-NEXT: movq 8(%rsi), %r11 ; CHECK-SSE1-NEXT: xorq %r10, %r11 ; CHECK-SSE1-NEXT: movq 16(%rsi), %rbx @@ -4333,6 +4344,7 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andq 24(%rcx), %rsi ; CHECK-SSE1-NEXT: andq 16(%rcx), %rbx ; CHECK-SSE1-NEXT: andq 8(%rcx), %r11 +; CHECK-SSE1-NEXT: xorq %r9, %rdx ; CHECK-SSE1-NEXT: andq (%rcx), %rdx ; CHECK-SSE1-NEXT: xorq %r9, %rdx ; CHECK-SSE1-NEXT: xorq %r10, %r11 @@ -4348,15 +4360,15 @@ define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE2-LABEL: in_v4i64: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2 ; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1 ; CHECK-SSE2-NEXT: andps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2 -; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2 +; CHECK-SSE2-NEXT: movaps %xmm1, %xmm3 +; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm3 ; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1 -; CHECK-SSE2-NEXT: orps %xmm2, %xmm1 +; CHECK-SSE2-NEXT: orps %xmm2, %xmm0 +; CHECK-SSE2-NEXT: orps %xmm3, %xmm1 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_v4i64: diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 7c1a1e285ca05..333632eaecd32 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -169,7 +169,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE41-NEXT: pinsrd $2, %edx, %xmm0 ; SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovsxwd {{.*#+}} xmm1 = [2047,2047,2047,2047] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: psrld $1, %xmm2 @@ -180,9 +180,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE41-NEXT: por %xmm2, %xmm3 ; SSE41-NEXT: pand %xmm1, %xmm3 ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: pextrb $4, %xmm3, %edx ; SSE41-NEXT: pextrb $8, %xmm3, %ecx +; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -205,9 +205,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vpextrb $4, %xmm0, %edx ; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: # kill: def $dl killed $dl killed $edx ; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx @@ -227,9 +227,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax ; 
AVX2-NEXT: vpextrb $4, %xmm0, %edx ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: # kill: def $dl killed $dl killed $edx ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll index 8b7d87da1d6e9..82a23a4305c64 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll @@ -47,9 +47,9 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: cmpl $858993460, %eax # imm = 0x33333334 +; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD ; X86-NEXT: movl $42, %eax +; X86-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 ; X86-NEXT: jb .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 838086e366fbf..6da1afccf79a3 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -11,8 +11,8 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -162,8 +162,8 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -240,9 +240,9 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -318,8 +318,8 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -391,12 +391,12 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-SSE2-NEXT: por %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -483,7 +483,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -539,8 +539,8 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -823,8 +823,8 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -915,7 +915,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,2] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -971,8 +971,8 @@ 
define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -1145,7 +1145,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1475,8 +1475,8 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1552,8 +1552,8 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1706,8 +1706,8 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1797,7 +1797,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [1,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; 
CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1865,7 +1865,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,268435456] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll index 6a36cd2a86d5c..cbe83ef64831d 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -98,9 +98,9 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind { ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [858993458,858993458,858993458,858993458] ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -168,9 +168,9 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [715827882,715827882,715827882,715827882] ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index 2166e43fc4286..12c1fe9187226 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -564,7 +564,7 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index 36094fe56d577..7b6d4064bfcd8 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -143,7 +143,7 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-SSE41-LABEL: t2_narrow: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [43691,43691,43691,43691,43691,43691,43691,43691] -; CHECK-SSE41-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744073709507925,18446744073709507925] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535] ; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 ; CHECK-SSE41-NEXT: 
pcmpeqw %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 @@ -189,16 +189,16 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 ; CHECK-SSE2-NEXT: psrlq $32, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE2-NEXT: paddq %xmm3, %xmm0 ; CHECK-SSE2-NEXT: psllq $32, %xmm0 ; CHECK-SSE2-NEXT: paddq %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] ; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] ; CHECK-SSE2-NEXT: pand %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: por %xmm1, %xmm0 @@ -214,16 +214,16 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 ; CHECK-SSE41-NEXT: psrlq $32, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 ; CHECK-SSE41-NEXT: paddq %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psllq $32, %xmm0 ; CHECK-SSE41-NEXT: paddq %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pand %xmm2, %xmm1 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: por %xmm1, %xmm0 @@ -254,7 +254,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; ; CHECK-AVX2-LABEL: t3_wide: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] ; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll index 94c7892795c2b..5a02235490bb3 100644 --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -115,7 +115,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] ; SSE-NEXT: pmulhuw %xmm0, %xmm1 ; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] ; SSE-NEXT: pmullw %xmm1, %xmm2 ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 @@ -139,9 +139,9 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63] ; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: andl $31, %eax ; SSE-NEXT: pinsrw $1, %eax, %xmm1 ; SSE-NEXT: pextrw $2, %xmm0, %eax @@ -176,8 +176,8 @@ define <4 
x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; AVX2-LABEL: dont_fold_urem_power_of_two: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: andl $31, %eax ; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -210,14 +210,14 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-NEXT: leal (%rdx,%rdx,2), %ecx ; SSE-NEXT: shll $3, %ecx ; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: pextrw $1, %xmm0, %ecx ; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B +; SSE-NEXT: shrl $25, %eax +; SSE-NEXT: imull $654, %eax, %eax # imm = 0x28E +; SSE-NEXT: subl %eax, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 ; SSE-NEXT: pinsrw $2, %edx, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 @@ -242,14 +242,14 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; AVX-NEXT: leal (%rdx,%rdx,2), %ecx ; AVX-NEXT: shll $3, %ecx ; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %ecx ; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B +; AVX-NEXT: shrl $25, %eax +; AVX-NEXT: imull $654, %eax, %eax # imm = 0x28E +; AVX-NEXT: subl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax ; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 diff --git a/llvm/test/CodeGen/X86/use-add-flags.ll b/llvm/test/CodeGen/X86/use-add-flags.ll index a28bad20e5d78..a6d31c0ae08c3 100644 --- a/llvm/test/CodeGen/X86/use-add-flags.ll +++ b/llvm/test/CodeGen/X86/use-add-flags.ll @@ -10,15 +10,15 @@ define i32 @test1(ptr %x, i32 %y, i32 %a, i32 %b) nounwind { ; LNX-LABEL: test1: ; LNX: # %bb.0: -; LNX-NEXT: movl %edx, %eax ; LNX-NEXT: addl (%rdi), %esi +; LNX-NEXT: movl %edx, %eax ; LNX-NEXT: cmovnsl %ecx, %eax ; LNX-NEXT: retq ; ; WIN-LABEL: test1: ; WIN: # %bb.0: -; WIN-NEXT: movl %r8d, %eax ; WIN-NEXT: addl (%rcx), %edx +; WIN-NEXT: movl %r8d, %eax ; WIN-NEXT: cmovnsl %r9d, %eax ; WIN-NEXT: retq %tmp2 = load i32, ptr %x, align 4 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..2ce7f5a49f872 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -196,30 +196,30 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: shldl %cl, %edi, %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %edi, %edx -; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: cmovnel 
%esi, %edx +; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %ebp, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebp, %eax -; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %edi, %eax ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl %eax, %ebx ; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: cmovnel %eax, %esi ; X86-NEXT: cmovnel %eax, %edx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index ebb5e135eacd0..902756600381e 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -47,9 +47,9 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: subl $16, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testb $32, %cl @@ -141,10 +141,10 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; X64-NEXT: movdqa %xmm5, %xmm2 ; X64-NEXT: psrld %xmm1, %xmm2 -; X64-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] ; X64-NEXT: movdqa %xmm6, %xmm1 ; X64-NEXT: psrld %xmm3, %xmm1 ; X64-NEXT: psrld %xmm4, %xmm5 +; X64-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] ; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] ; X64-NEXT: pcmpeqd %xmm5, %xmm0 @@ -169,47 +169,48 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: cmpl %ebp, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl $-1, %ebx -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movb %ah, %cl -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: cmovnel %ebx, %edx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %ebp -; 
X86-NEXT: movl %ebp, %eax +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: cmpl %eax, %ebx -; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebp, 4(%ecx) +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -301,71 +302,72 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzwl %bx, %edi ; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx +; X86-NEXT: cmpw %di, %dx +; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X86-NEXT: cmovnel %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %ebx +; X86-NEXT: shrl %cl, %ebx +; X86-NEXT: cmpw %bx, %si +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF +; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: shll %cl, %ebp ; X86-NEXT: movzwl %bp, %edx ; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: 
cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movzwl %si, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovnel %edx, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, %eax @@ -486,44 +488,43 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb %dl, %ch +; X86-NEXT: shlb %cl, %ch +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movb %bl, %bh -; X86-NEXT: shlb %cl, %bh -; X86-NEXT: movzbl %bh, %edi -; X86-NEXT: shrb %cl, %bh -; X86-NEXT: cmpb %bh, %bl +; X86-NEXT: movzbl %ch, %edi +; X86-NEXT: shrb %cl, %ch +; X86-NEXT: cmpb %ch, %dl ; X86-NEXT: movl $255, %esi ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %dh, %bl -; X86-NEXT: movb %ah, %cl -; X86-NEXT: shlb %cl, %bl -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: shrb %cl, %bl -; X86-NEXT: cmpb %bl, %dh +; X86-NEXT: movb %ah, %dh +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shlb %cl, %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movzbl %dh, %edi +; X86-NEXT: shrb %cl, %dh +; X86-NEXT: cmpb %dh, %ah ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %ch, %ah +; X86-NEXT: movb %ch, %al ; X86-NEXT: movb %dl, %cl -; X86-NEXT: shlb %cl, %ah -; X86-NEXT: movzbl %ah, %edi -; X86-NEXT: shrb %cl, %ah +; X86-NEXT: shlb %cl, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: shrb %cl, %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpb %ah, %ch +; X86-NEXT: cmpb %al, %ch ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %dl, %ah -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shlb %cl, %ah -; X86-NEXT: movzbl %ah, %edi -; X86-NEXT: shrb %cl, %ah -; X86-NEXT: cmpb %ah, %dl +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shlb 
%cl, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: shrb %cl, %al +; X86-NEXT: cmpb %al, %dl ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovnel %esi, %edi diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll index 6749a1f9147af..fe05954e836c0 100644 --- a/llvm/test/CodeGen/X86/usub_sat.ll +++ b/llvm/test/CodeGen/X86/usub_sat.ll @@ -32,9 +32,9 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovbl %ecx, %edx ; X86-NEXT: cmovbl %ecx, %eax @@ -123,7 +123,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -136,6 +135,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: subl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl %ebx, %esi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovbl %ebx, %edi ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: movl %esi, 8(%eax) diff --git a/llvm/test/CodeGen/X86/usub_sat_plus.ll b/llvm/test/CodeGen/X86/usub_sat_plus.ll index 0fb14ad5cf7b0..e278526131372 100644 --- a/llvm/test/CodeGen/X86/usub_sat_plus.ll +++ b/llvm/test/CodeGen/X86/usub_sat_plus.ll @@ -35,9 +35,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovbl %ecx, %edx ; X86-NEXT: cmovbl %ecx, %eax @@ -82,9 +82,9 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y, i16 zeroext %z) nounw define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y, i8 zeroext %z) nounwind { ; X86-LABEL: func8: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mulb {{[0-9]+}}(%esp) +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: subb %al, %cl ; X86-NEXT: movzbl %cl, %eax diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 4e17ca6fbae33..2152bbeac0b4e 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -499,7 +499,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 @@ -507,7 +507,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; ; AVX512-LABEL: v16i4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpand %xmm2, 
%xmm0, %xmm0 ; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 @@ -834,7 +834,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1055,94 +1055,60 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: xorl %edi, %edi +; SSE-NEXT: xorl %r10d, %r10d ; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: cmovbq %rdi, %rsi -; SSE-NEXT: cmovbq %rdi, %rdx +; SSE-NEXT: cmovbq %r10, %rsi +; SSE-NEXT: cmovbq %r10, %rdx ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: cmovbq %rdi, %r8 -; SSE-NEXT: cmovbq %rdi, %rcx -; SSE-NEXT: movq %r8, 24(%rax) -; SSE-NEXT: movq %rcx, 16(%rax) -; SSE-NEXT: movq %rdx, 8(%rax) -; SSE-NEXT: movq %rsi, (%rax) +; SSE-NEXT: cmovbq %r10, %r8 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: cmovbq %r10, %rcx +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rcx, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: xorl %edi, %edi +; AVX-NEXT: xorl %r10d, %r10d ; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: cmovbq %rdi, %rsi -; AVX-NEXT: cmovbq %rdi, %rdx +; AVX-NEXT: cmovbq %r10, %rsi +; AVX-NEXT: cmovbq %r10, %rdx ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: cmovbq %rdi, %r8 -; AVX-NEXT: cmovbq %rdi, %rcx -; AVX-NEXT: movq %r8, 24(%rax) -; AVX-NEXT: movq %rcx, 16(%rax) -; AVX-NEXT: movq %rdx, 8(%rax) -; AVX-NEXT: movq %rsi, (%rax) +; AVX-NEXT: cmovbq %r10, %r8 +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: cmovbq %r10, %rcx +; AVX-NEXT: movq %r8, 24(%rdi) +; AVX-NEXT: movq %rcx, 16(%rdi) +; AVX-NEXT: movq %rdx, 8(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z } define void @PR48223(ptr %p0) { -; SSE2-LABEL: PR48223: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSE2-NEXT: psubusw %xmm4, %xmm1 -; SSE2-NEXT: psubusw %xmm4, %xmm0 -; SSE2-NEXT: psubusw %xmm4, %xmm3 -; SSE2-NEXT: psubusw %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: movdqa %xmm3, 48(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR48223: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm0 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm1 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSSE3-NEXT: psubusw %xmm4, %xmm1 -; SSSE3-NEXT: psubusw %xmm4, %xmm0 -; SSSE3-NEXT: psubusw %xmm4, %xmm3 -; SSSE3-NEXT: psubusw %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR48223: -; SSE41: 
# %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa 32(%rdi), %xmm2 -; SSE41-NEXT: movdqa 48(%rdi), %xmm3 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSE41-NEXT: psubusw %xmm4, %xmm1 -; SSE41-NEXT: psubusw %xmm4, %xmm0 -; SSE41-NEXT: psubusw %xmm4, %xmm3 -; SSE41-NEXT: psubusw %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 32(%rdi) -; SSE41-NEXT: movdqa %xmm3, 48(%rdi) -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm1, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: PR48223: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] +; SSE-NEXT: psubusw %xmm4, %xmm1 +; SSE-NEXT: psubusw %xmm4, %xmm0 +; SSE-NEXT: psubusw %xmm4, %xmm3 +; SSE-NEXT: psubusw %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm3, 48(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: PR48223: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll index 01f10372eaa2d..3e708bfff481a 100644 --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -236,7 +236,7 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; X86-LABEL: two_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -244,7 +244,7 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; ; X64-LABEL: two_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -253,8 +253,8 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; X86-AVX2-LABEL: two_ands: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl @@ -262,8 +262,8 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; X64-AVX2-LABEL: two_ands: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq @@ -294,23 +294,23 @@ entry: 
define <8 x i32> @three_ands(<8 x float> %x) { ; X86-LABEL: three_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: three_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -370,27 +370,27 @@ entry: define <8 x i32> @four_ands(<8 x float> %x) { ; X86-LABEL: four_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -460,31 +460,31 @@ entry: define <8 x i32> @five_ands(<8 x float> %x) { ; X86-LABEL: five_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpltps 
{{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -500,9 +500,9 @@ define <8 x i32> @five_ands(<8 x float> %x) { ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -518,9 +518,9 @@ define <8 x i32> @five_ands(<8 x float> %x) { ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -564,7 +564,7 @@ entry: define <8 x i32> @two_or(<8 x float> %x) { ; X86-LABEL: two_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -572,7 +572,7 @@ define <8 x i32> @two_or(<8 x float> %x) { ; ; X64-LABEL: two_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -581,8 +581,8 @@ define <8 x i32> @two_or(<8 x float> %x) { ; X86-AVX2-LABEL: two_or: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl @@ -590,8 +590,8 @@ define <8 x i32> @two_or(<8 x float> %x) { ; X64-AVX2-LABEL: two_or: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq @@ -624,23 +624,23 @@ entry: define <8 x i32> @three_or(<8 x float> %x) { ; X86-LABEL: three_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: three_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -704,27 +704,27 @@ entry: define <8 x i32> @four_or(<8 x float> %x) { ; X86-LABEL: four_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-NEXT: vorps %ymm3, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-NEXT: vorps %ymm3, %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -800,31 +800,31 @@ entry: define <8 x i32> @five_or(<8 x float> %x) { ; X86-LABEL: five_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X86-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -840,9 +840,9 @@ define <8 x i32> @five_or(<8 x 
float> %x) { ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -858,9 +858,9 @@ define <8 x i32> @five_or(<8 x float> %x) { ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -871,9 +871,9 @@ define <8 x i32> @five_or(<8 x float> %x) { ; X86-AVX512-NEXT: korw %k1, %k0, %k0 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1 +; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 ; X86-AVX512-NEXT: korw %k1, %k0, %k0 -; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 -; X86-AVX512-NEXT: korw %k1, %k0, %k0 +; X86-AVX512-NEXT: korw %k2, %k0, %k0 ; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 ; X86-AVX512-NEXT: korw %k1, %k0, %k1 ; X86-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -887,9 +887,9 @@ define <8 x i32> @five_or(<8 x float> %x) { ; X64-AVX512-NEXT: korw %k1, %k0, %k0 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1 +; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 ; X64-AVX512-NEXT: korw %k1, %k0, %k0 -; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 -; X64-AVX512-NEXT: korw %k1, %k0, %k0 +; X64-AVX512-NEXT: korw %k2, %k0, %k0 ; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 ; X64-AVX512-NEXT: korw %k1, %k0, %k1 ; X64-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -912,7 +912,7 @@ entry: define <8 x i32> @three_or_and(<8 x float> %x) { ; X86-LABEL: three_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -923,7 +923,7 @@ define <8 x i32> @three_or_and(<8 x float> %x) { ; ; X64-LABEL: three_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -990,27 +990,27 @@ entry: define <8 x i32> @four_or_and(<8 x float> %x) { ; X86-LABEL: four_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -1082,31 +1082,31 @@ entry: define <8 x i32> @five_or_and(<8 x float> %x) { ; X86-LABEL: five_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vorps %ymm1, %ymm2, %ymm1 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X86-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X86-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vorps 
%ymm1, %ymm2, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X64-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -1119,9 +1119,9 @@ define <8 x i32> @five_or_and(<8 x float> %x) { ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -1137,9 +1137,9 @@ define <8 x i32> @five_or_and(<8 x float> %x) { ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -1190,27 +1190,27 @@ entry: define <8 x i32> @four_or_and_xor(<8 x float> %x) { ; X86-LABEL: four_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; X64-NEXT: vxorps %xmm2, 
%xmm2, %xmm2 -; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -1284,9 +1284,9 @@ entry: define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X86-LABEL: five_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-NEXT: vxorps %ymm3, %ymm2, %ymm2 @@ -1299,9 +1299,9 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; ; X64-LABEL: five_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2 @@ -1354,9 +1354,9 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X86-AVX512-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k2 +; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k3 ; X86-AVX512-NEXT: kxorw %k2, %k1, %k1 -; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 -; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 {%k2} +; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 {%k3} ; X86-AVX512-NEXT: kxorw %k2, %k1, %k1 ; X86-AVX512-NEXT: korw %k0, %k1, %k1 ; X86-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -1369,9 +1369,9 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X64-AVX512-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k2 +; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k3 ; X64-AVX512-NEXT: kxorw %k2, %k1, %k1 -; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 -; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 {%k2} +; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 {%k3} ; X64-AVX512-NEXT: kxorw %k2, %k1, %k1 ; X64-AVX512-NEXT: korw %k0, %k1, %k1 ; X64-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -1393,7 +1393,7 @@ entry: define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X86-LABEL: six_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1410,7 +1410,7 @@ 
define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; ; X64-LABEL: six_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1440,9 +1440,9 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -1461,9 +1461,9 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 7f4111e65cc17..67af6b9f4fd32 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -70,8 +70,8 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw ; SSE3-NEXT: pxor %xmm1, %xmm2 ; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE3-NEXT: pand %xmm4, %xmm3 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE3-NEXT: por %xmm3, %xmm2 @@ -95,8 +95,8 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw ; SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm2 @@ -168,7 +168,7 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw ; AVX512-LABEL: 
var_shuffle_zero_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3] ; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -315,7 +315,7 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw ; ; SSE41-LABEL: var_shuffle_zero_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,4,4,4] ; SSE41-NEXT: pmaxud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: por %xmm2, %xmm1 @@ -556,7 +556,7 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw ; ; SSE41-LABEL: var_shuffle_zero_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8] ; SSE41-NEXT: pmaxuw %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE41-NEXT: por %xmm2, %xmm1 @@ -1041,8 +1041,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSE3-NEXT: pxor %xmm1, %xmm2 ; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSE3-NEXT: pand %xmm4, %xmm3 ; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE3-NEXT: por %xmm3, %xmm2 @@ -1065,8 +1065,8 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] ; SSSE3-NEXT: pand %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm2 @@ -1137,7 +1137,7 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) ; AVX512-LABEL: var_shuffle_zero_v2f64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3] ; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -1284,7 +1284,7 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n ; ; SSE41-LABEL: var_shuffle_zero_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,4,4,4] ; SSE41-NEXT: pmaxud %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE41-NEXT: por %xmm2, %xmm1 @@ -1781,10 +1781,11 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: subq $392, %rsp # imm = 0x188 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: movd %xmm2, %ecx +; SSE41-NEXT: pextrb $1, %xmm2, %eax ; SSE41-NEXT: movaps %xmm1, 368(%rsp) ; SSE41-NEXT: movaps %xmm0, 352(%rsp) -; SSE41-NEXT: andl $31, %eax +; SSE41-NEXT: andl $31, %ecx ; SSE41-NEXT: movaps %xmm1, 336(%rsp) ; SSE41-NEXT: movaps %xmm0, 320(%rsp) ; SSE41-NEXT: movaps %xmm1, 304(%rsp) @@ -1815,9 +1816,8 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> 
%v, <16 x i8> %in ; SSE41-NEXT: movaps %xmm0, -96(%rsp) ; SSE41-NEXT: movaps %xmm1, -112(%rsp) ; SSE41-NEXT: movaps %xmm0, -128(%rsp) -; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pextrb $1, %xmm2, %eax +; SSE41-NEXT: movzbl 352(%rsp,%rcx), %ecx +; SSE41-NEXT: movd %ecx, %xmm0 ; SSE41-NEXT: andl $31, %eax ; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $2, %xmm2, %eax @@ -1876,9 +1876,9 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1886,9 +1886,9 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1896,9 +1896,9 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2011,8 +2011,8 @@ define void @indices_convert() { ; SSE41-NEXT: extractps $2, %xmm0, %eax ; SSE41-NEXT: movaps %xmm0, -24(%rsp) ; SSE41-NEXT: movaps %xmm0, -40(%rsp) -; SSE41-NEXT: andl $3, %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx +; SSE41-NEXT: andl $3, %eax ; SSE41-NEXT: movaps %xmm0, -56(%rsp) ; SSE41-NEXT: movaps %xmm0, -72(%rsp) ; SSE41-NEXT: andl $3, %ecx diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 7296cc27894c3..9d8ce6d5bdc98 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -82,7 +82,8 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw ; XOP-LABEL: var_shuffle_zero_v4i64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,3] +; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [3,3] +; XOP-NEXT: # xmm3 = mem[0,0] ; XOP-NEXT: vpcomgtuq %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm3 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -196,8 +197,8 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -350,23 +351,23 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX1-LABEL: var_shuffle_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] -; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [256,256,256,256,256,256,256,256] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm8 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm6 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v16i16: @@ -376,9 +377,9 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v16i16: @@ -388,9 +389,9 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX512VLDQ-LABEL: var_shuffle_v16i16: @@ -543,10 +544,10 @@ define <16 x i16> @var_shuffle_zero_v16i16(<16 x i16> %v, <16 x i16> %indices) n ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 
-; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) -; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm3 ^ (ymm4 & (ymm0 ^ ymm3)) +; AVX512VLDQ-NEXT: vpandn %ymm4, %ymm2, %ymm0 ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_zero_v16i16: @@ -653,9 +654,9 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v32i8: @@ -663,9 +664,9 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX512VLDQ-LABEL: var_shuffle_v32i8: @@ -855,10 +856,10 @@ define <32 x i8> @var_shuffle_zero_v32i8(<32 x i8> %v, <32 x i8> %indices) nounw ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512VLDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VLDQ-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm0 ^ ymm3)) -; AVX512VLDQ-NEXT: vpandn %ymm1, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm3 ^ (ymm4 & (ymm0 ^ ymm3)) +; AVX512VLDQ-NEXT: vpandn %ymm4, %ymm2, %ymm0 ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_zero_v32i8: @@ -1058,7 +1059,8 @@ define <4 x double> @var_shuffle_zero_v4f64(<4 x double> %v, <4 x i64> %indices) ; XOP-LABEL: var_shuffle_zero_v4f64: ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,3] +; XOP-NEXT: vmovddup {{.*#+}} xmm3 = [3,3] +; XOP-NEXT: # xmm3 = mem[0,0] ; XOP-NEXT: vpcomgtuq %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpcomgtuq %xmm3, %xmm1, %xmm3 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -1172,8 +1174,8 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -1333,8 +1335,8 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 @@ -1344,8 +1346,8 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -1394,8 +1396,8 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -1451,22 +1453,22 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX1-LABEL: var_shuffle_v16i16_from_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514] -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256] -; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [256,256,256,256,256,256,256,256] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm4 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v16i16_from_v8i16: @@ -1474,11 +1476,11 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] ; AVX2-NEXT: 
vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v16i16_from_v8i16: @@ -1486,18 +1488,18 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] ; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLDQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] ; AVX512VLDQ-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512VLDQ-NEXT: vpshufb %ymm1, %ymm0, %ymm3 @@ -1594,21 +1596,21 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX2-LABEL: var_shuffle_v32i8_from_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v32i8_from_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8: @@ -1754,8 +1756,8 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpermilpd %ymm3, 
%ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 @@ -1765,8 +1767,8 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] +; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -1815,8 +1817,8 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -1868,9 +1870,9 @@ define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpermilps %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1974,8 +1976,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr ; AVX512-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vpcmpgtq %zmm0, %zmm2, %k1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [17,51,85,119] -; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = [34,68,102,136] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [17,51,85,119] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: movq %rbp, %rsp @@ -1997,7 +1999,7 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr ; AVX512VL-NEXT: vmovq (%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm2, %k1 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = [34,68,102,136] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [34,68,102,136] ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} ymm0 {%k1} = [17,51,85,119] ; AVX512VL-NEXT: movq %rbp, %rsp ; AVX512VL-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll index 88788013e4943..be0dce5b7846b 100644 --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -98,95 +98,95 @@ define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp ; AVX512F-NEXT: subq $128, %rsp +; AVX512F-NEXT: vpextrw $0, %xmm1, %eax ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrw $0, %xmm2, %ecx ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512F-NEXT: vpextrw $0, %xmm3, 
%edx ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512F-NEXT: vpextrw $0, %xmm4, %eax +; AVX512F-NEXT: vpextrw $0, %xmm4, %edi +; AVX512F-NEXT: vpextrw $1, %xmm4, %esi ; AVX512F-NEXT: vmovaps %zmm0, (%rsp) -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpextrw $1, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $2, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $3, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $4, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $5, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $6, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $7, %xmm4, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $0, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrw $1, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $2, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $3, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $4, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $5, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $6, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $7, %xmm3, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm3 -; AVX512F-NEXT: vpextrw $0, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrw $1, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $2, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $3, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $4, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $5, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $6, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrw $7, %xmm2, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2 -; AVX512F-NEXT: vpextrw $0, %xmm1, %eax +; AVX512F-NEXT: andl $31, %edi +; AVX512F-NEXT: 
movzwl (%rsp,%rdi,2), %edi +; AVX512F-NEXT: vmovd %edi, %xmm0 +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $2, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $3, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $4, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $5, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $6, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $7, %xmm4, %esi +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rsi,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $1, %xmm3, %esi +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: movzwl (%rsp,%rdx,2), %edx +; AVX512F-NEXT: vmovd %edx, %xmm4 +; AVX512F-NEXT: andl $31, %esi +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $2, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $3, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $4, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $5, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $6, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $7, %xmm3, %edx +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rdx,2), %xmm4, %xmm3 +; AVX512F-NEXT: vpextrw $1, %xmm2, %edx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: movzwl (%rsp,%rcx,2), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm4 +; AVX512F-NEXT: andl $31, %edx +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rdx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $2, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rcx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $3, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $4, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rcx,2), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $5, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: movzwl (%rsp,%rcx,2), %ecx +; AVX512F-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $6, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: movzwl (%rsp,%rcx,2), %ecx +; AVX512F-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrw $7, %xmm2, %ecx +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: movzwl (%rsp,%rcx,2), %ecx +; AVX512F-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm2 +; AVX512F-NEXT: vpextrw $1, %xmm1, %ecx ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrw $1, %xmm1, %eax -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 +; AVX512F-NEXT: andl $31, %ecx +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rcx,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $2, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: vpinsrw $2, 
(%rsp,%rax,2), %xmm4, %xmm4 @@ -327,171 +327,171 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp ; AVX512F-NEXT: subq $128, %rsp +; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512F-NEXT: vpextrb $0, %xmm3, %edx ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512F-NEXT: vpextrb $0, %xmm4, %eax +; AVX512F-NEXT: vpextrb $0, %xmm4, %edi +; AVX512F-NEXT: vpextrb $1, %xmm4, %esi ; AVX512F-NEXT: vmovaps %zmm0, (%rsp) -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpextrb $1, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $2, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $3, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $4, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $5, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $6, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $7, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $8, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $9, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $10, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $11, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $12, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $13, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $14, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $15, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrb $0, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrb $1, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $2, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $3, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $4, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $5, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $6, %xmm3, %eax -; AVX512F-NEXT: andl $63, 
%eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $7, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $8, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $9, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $10, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $11, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $12, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $13, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $14, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $15, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 -; AVX512F-NEXT: vpextrb $0, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrb $1, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $2, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $3, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $4, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $5, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $6, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $7, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $8, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $9, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $10, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $11, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $12, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $13, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $14, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrb $15, %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; 
AVX512F-NEXT: vpextrb $0, %xmm1, %eax +; AVX512F-NEXT: andl $63, %edi +; AVX512F-NEXT: movzbl (%rsp,%rdi), %edi +; AVX512F-NEXT: vmovd %edi, %xmm0 +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $2, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $3, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $4, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $5, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $6, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $7, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $8, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $9, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $10, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $11, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $12, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $13, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $14, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $15, %xmm4, %esi +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $1, %xmm3, %esi +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512F-NEXT: vmovd %edx, %xmm4 +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rsi), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $2, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $3, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $4, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $5, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $6, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $7, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $8, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $9, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $10, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $11, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm4, 
%xmm4 +; AVX512F-NEXT: vpextrb $12, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $13, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512F-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $14, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512F-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $15, %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512F-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3 +; AVX512F-NEXT: vpextrb $1, %xmm2, %edx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm4 +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm4, %xmm4 +; AVX512F-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vpinsrb $15, %ecx, %xmm4, %xmm2 +; AVX512F-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vpextrb $1, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rcx), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 @@ -552,171 +552,171 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { ; AVX512BW-NEXT: movq %rsp, %rbp ; AVX512BW-NEXT: andq $-64, %rsp ; AVX512BW-NEXT: subq $128, %rsp +; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; 
AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $0, %xmm4, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm4, %edi +; AVX512BW-NEXT: vpextrb $1, %xmm4, %esi ; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm0 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 -; 
AVX512BW-NEXT: vpextrb $8, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax +; AVX512BW-NEXT: andl $63, %edi +; AVX512BW-NEXT: movzbl (%rsp,%rdi), 
%edi +; AVX512BW-NEXT: vmovd %edi, %xmm0 +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $3, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $4, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $5, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $6, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $7, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $8, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $9, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $10, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $11, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $12, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $13, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $14, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $15, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $1, %xmm3, %esi +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512BW-NEXT: vmovd %edx, %xmm4 +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rsi), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $2, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: 
vpextrb $12, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3 +; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm4 +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm4, %xmm2 +; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm4, %xmm4 +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rcx), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm4, %xmm4 @@ -1067,146 +1067,146 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: subq $128, %rsp ; AVX512F-NEXT: # kill: def $esi killed $esi def $rsi ; 
AVX512F-NEXT: vpbroadcastd %esi, %zmm2 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512F-NEXT: andl $63, %esi -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $1, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm3, %ecx +; AVX512F-NEXT: vpextrd $1, %xmm4, %edx +; AVX512F-NEXT: vmovd %xmm4, %r8d +; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: andl $63, %r8d +; AVX512F-NEXT: movzbl (%rsp,%r8), %r8d +; AVX512F-NEXT: vmovd %r8d, %xmm0 +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm4, %edx ; AVX512F-NEXT: 
vextracti128 $1, %ymm4, %xmm5 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx ; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $1, %xmm4, %esi +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm4, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm4, %edx +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $3, %xmm4, %esi +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm3, %edx +; AVX512F-NEXT: andl $63, %esi +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrd $2, %xmm3, %esi +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512F-NEXT: vmovd %edx, %xmm4 +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm3, %ecx +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-NEXT: andl $63, %esi +; 
AVX512F-NEXT: vpinsrb $2, (%rsp,%rsi), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %xmm5, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $2, %xmm5, %ecx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %xmm5, %ecx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $1, %xmm5, %edx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %xmm1, %ecx +; AVX512F-NEXT: vpextrd $2, %xmm5, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm5, %edx +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %xmm3, %edx +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512F-NEXT: vpextrd $2, %xmm1, %edx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-NEXT: andl $63, %edx +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm6, %ecx ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm6, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $2, %xmm6, %eax +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rcx), %xmm5, %xmm7 +; AVX512F-NEXT: vpextrd $3, %xmm6, %ecx +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm6, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm6 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rcx), %xmm6, %xmm6 ; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %ecx ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rcx), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm2, %ecx ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), 
%xmm6, %xmm6 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm7 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax @@ -1240,10 +1240,10 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 ; AVX512F-NEXT: vpextrd $3, %xmm8, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -1261,49 +1261,49 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm1 +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512F-NEXT: vmovaps %zmm0, 192(%rdi) -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512F-NEXT: vmovaps %zmm3, 128(%rdi) +; AVX512F-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512F-NEXT: 
vmovaps %zmm2, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp @@ -1318,146 +1318,146 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: # kill: def $esi killed $esi def $rsi ; AVX512BW-NEXT: vpbroadcastd %esi, %zmm2 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: andl $63, %esi -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm3, %ecx +; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx +; AVX512BW-NEXT: vmovd %xmm4, %r8d +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: andl $63, %r8d +; 
AVX512BW-NEXT: movzbl (%rsp,%r8), %r8d +; AVX512BW-NEXT: vmovd %r8d, %xmm0 +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vmovd %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm4, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %esi +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rdx), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm3, %edx +; AVX512BW-NEXT: andl $63, 
%esi +; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rsi), %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrd $2, %xmm3, %esi +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: movzbl (%rsp,%rdx), %edx +; AVX512BW-NEXT: vmovd %edx, %xmm4 +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %ecx +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512BW-NEXT: andl $63, %esi +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rsi), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vmovd %xmm5, %ecx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $2, %xmm5, %ecx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vmovd %xmm5, %ecx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rcx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vmovd %xmm1, %ecx +; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vmovd %xmm3, %edx +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rdx), %xmm4, %xmm4 +; AVX512BW-NEXT: vpextrd $2, %xmm1, %edx +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512BW-NEXT: andl $63, %edx +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rdx), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %ecx ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm6, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rcx), %xmm5, %xmm7 +; AVX512BW-NEXT: vpextrd $3, %xmm6, %ecx +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm6 +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rcx), %xmm6, %xmm6 ; AVX512BW-NEXT: vmovd 
%xmm5, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %ecx ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rcx), %xmm6, %xmm6 +; AVX512BW-NEXT: vmovd %xmm2, %ecx ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm7 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax +; AVX512BW-NEXT: andl $63, %ecx +; AVX512BW-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm7 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax @@ -1491,10 +1491,10 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 ; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -1512,49 +1512,49 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm2, 
%zmm2 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rdi) +; AVX512BW-NEXT: vmovaps %zmm3, 128(%rdi) +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vmovaps %zmm2, (%rdi) ; AVX512BW-NEXT: movq %rbp, %rsp ; AVX512BW-NEXT: popq %rbp @@ -1570,12 +1570,12 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: # kill: def $esi killed $esi def $rsi ; AVX512VBMI-NEXT: vpbroadcastd %esi, %zmm1 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512VBMI-NEXT: vmovd %xmm2, %eax -; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsp) -; AVX512VBMI-NEXT: andl $63, %eax -; AVX512VBMI-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VBMI-NEXT: vmovd %eax, %xmm3 ; AVX512VBMI-NEXT: vpextrd $1, %xmm2, %eax +; AVX512VBMI-NEXT: vmovd %xmm2, %ecx +; AVX512VBMI-NEXT: vmovdqa64 %zmm0, (%rsp) +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: movzbl (%rsp,%rcx), %ecx +; AVX512VBMI-NEXT: vmovd %ecx, %xmm3 ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $1, (%rsp,%rax), %xmm3, %xmm3 ; AVX512VBMI-NEXT: vpextrd $2, %xmm2, %eax @@ -1623,8 +1623,8 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512VBMI-NEXT: andl $63, %eax ; AVX512VBMI-NEXT: vpinsrb $15, (%rsp,%rax), %xmm3, %xmm2 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3 -; AVX512VBMI-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512VBMI-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512VBMI-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512VBMI-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll index 84c0ff61cf9b7..0251784b32b23 100644 --- a/llvm/test/CodeGen/X86/vec-strict-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128.ll @@ -353,15 +353,16 @@ define <2 x double> @f14(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 { ; SSE-X86-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; SSE-X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; SSE-X86-NEXT: movlps %xmm2, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movlps %xmm1, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movlps %xmm0, (%esp) +; SSE-X86-NEXT: movaps %xmm0, %xmm2 +; SSE-X86-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-X86-NEXT: movups %xmm2, (%esp) ; SSE-X86-NEXT: calll fma ; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-X86-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) ; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-X86-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) -; SSE-X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-X86-NEXT: movhps %xmm0, (%esp) +; SSE-X86-NEXT: unpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-X86-NEXT: # xmm0 = xmm0[1],mem[1] +; SSE-X86-NEXT: movups %xmm0, (%esp) ; SSE-X86-NEXT: fstpl {{[0-9]+}}(%esp) ; SSE-X86-NEXT: wait ; SSE-X86-NEXT: calll fma diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll 
b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll index 8c64dd2d9b49f..22962df84b13c 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll @@ -715,9 +715,9 @@ define <2 x i16> @test_v2f16_oeq_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, < ; X86-NEXT: testb %al, %cl ; X86-NEXT: setne %al ; X86-NEXT: andl $1, %eax -; X86-NEXT: kmovw %eax, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm2 ; X86-NEXT: vucomish 10(%ebp), %xmm2 +; X86-NEXT: kmovw %eax, %k0 ; X86-NEXT: setnp %al ; X86-NEXT: sete %cl ; X86-NEXT: testb %al, %cl @@ -770,11 +770,11 @@ define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, < ; X86-NEXT: vcomish 8(%ebp), %xmm2 ; X86-NEXT: seta %al ; X86-NEXT: andl $1, %eax -; X86-NEXT: kmovw %eax, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm2 ; X86-NEXT: vcomish 10(%ebp), %xmm2 -; X86-NEXT: seta %al -; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: seta %cl +; X86-NEXT: kmovw %eax, %k0 +; X86-NEXT: kmovd %ecx, %k1 ; X86-NEXT: kshiftlw $15, %k1, %k1 ; X86-NEXT: kshiftrw $14, %k1, %k1 ; X86-NEXT: korw %k1, %k0, %k1 @@ -788,12 +788,12 @@ define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, < ; X64-NEXT: vcomish %xmm3, %xmm2 ; X64-NEXT: seta %al ; X64-NEXT: andl $1, %eax -; X64-NEXT: kmovw %eax, %k0 ; X64-NEXT: vpsrld $16, %xmm3, %xmm3 ; X64-NEXT: vpsrld $16, %xmm2, %xmm2 ; X64-NEXT: vcomish %xmm3, %xmm2 -; X64-NEXT: seta %al -; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: seta %cl +; X64-NEXT: kmovw %eax, %k0 +; X64-NEXT: kmovd %ecx, %k1 ; X64-NEXT: kshiftlw $15, %k1, %k1 ; X64-NEXT: kshiftrw $14, %k1, %k1 ; X64-NEXT: korw %k1, %k0, %k1 @@ -826,9 +826,9 @@ define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: korw %k1, %k0, %k0 ; X86-NEXT: movw $-5, %ax ; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; X86-NEXT: vucomish 12(%ebp), %xmm3 +; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: setae %al ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kshiftlw $15, %k1, %k1 @@ -905,33 +905,33 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: vcomish %xmm2, %xmm3 ; X86-NEXT: seta %al ; X86-NEXT: andl $1, %eax -; X86-NEXT: kmovw %eax, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm3 ; X86-NEXT: vmovsh {{.*#+}} xmm4 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm3, %xmm4 -; X86-NEXT: seta %al -; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: seta %cl +; X86-NEXT: kmovw %eax, %k0 +; X86-NEXT: kmovd %ecx, %k1 ; X86-NEXT: kshiftlw $15, %k1, %k1 ; X86-NEXT: kshiftrw $14, %k1, %k1 ; X86-NEXT: korw %k1, %k0, %k0 ; X86-NEXT: movw $-5, %ax ; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; X86-NEXT: vmovsh {{.*#+}} xmm4 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm3, %xmm4 ; X86-NEXT: seta %al +; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kshiftlw $15, %k1, %k1 ; X86-NEXT: kshiftrw $13, %k1, %k1 ; X86-NEXT: korw %k1, %k0, %k0 ; X86-NEXT: movw $-9, %ax ; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 ; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm2, %xmm3 ; X86-NEXT: seta %al +; X86-NEXT: kandw %k1, %k0, %k0 ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kshiftlw $15, %k1, %k1 ; X86-NEXT: kshiftrw $12, %k1, %k1 @@ -946,33 +946,33 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> 
%a, <4 x i16> %b, <4 x half> %f1, < ; X64-NEXT: vcomish %xmm2, %xmm3 ; X64-NEXT: seta %al ; X64-NEXT: andl $1, %eax -; X64-NEXT: kmovw %eax, %k0 ; X64-NEXT: vpsrld $16, %xmm2, %xmm4 ; X64-NEXT: vpsrld $16, %xmm3, %xmm5 ; X64-NEXT: vcomish %xmm4, %xmm5 -; X64-NEXT: seta %al -; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: seta %cl +; X64-NEXT: kmovw %eax, %k0 +; X64-NEXT: kmovd %ecx, %k1 ; X64-NEXT: kshiftlw $15, %k1, %k1 ; X64-NEXT: kshiftrw $14, %k1, %k1 ; X64-NEXT: korw %k1, %k0, %k0 ; X64-NEXT: movw $-5, %ax ; X64-NEXT: kmovd %eax, %k1 -; X64-NEXT: kandw %k1, %k0, %k0 ; X64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] ; X64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3] ; X64-NEXT: vcomish %xmm4, %xmm5 ; X64-NEXT: seta %al +; X64-NEXT: kandw %k1, %k0, %k0 ; X64-NEXT: kmovd %eax, %k1 ; X64-NEXT: kshiftlw $15, %k1, %k1 ; X64-NEXT: kshiftrw $13, %k1, %k1 ; X64-NEXT: korw %k1, %k0, %k0 ; X64-NEXT: movw $-9, %ax ; X64-NEXT: kmovd %eax, %k1 -; X64-NEXT: kandw %k1, %k0, %k0 ; X64-NEXT: vpsrlq $48, %xmm2, %xmm2 ; X64-NEXT: vpsrlq $48, %xmm3, %xmm3 ; X64-NEXT: vcomish %xmm2, %xmm3 ; X64-NEXT: seta %al +; X64-NEXT: kandw %k1, %k0, %k0 ; X64-NEXT: kmovd %eax, %k1 ; X64-NEXT: kshiftlw $15, %k1, %k1 ; X64-NEXT: kshiftrw $12, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll index 209d6a5a67100..45ea780ae31bd 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll @@ -106,6 +106,7 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32: # %bb.0: ; SSE-32-NEXT: pushl %ebp ; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 @@ -118,31 +119,32 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: movaps %xmm3, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-32-NEXT: movaps %xmm2, %xmm6 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 -; SSE-32-NEXT: movl $0, %edx -; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: ucomiss %xmm4, %xmm5 +; SSE-32-NEXT: movl $0, %esi +; SSE-32-NEXT: cmoval %ecx, %esi +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %esi, %xmm4 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: cmoval %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: 
pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; @@ -157,30 +159,30 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: movl $-1, %ecx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmoval %ecx, %edx +; SSE-64-NEXT: movaps %xmm3, %xmm4 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-64-NEXT: movaps %xmm2, %xmm5 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-64-NEXT: ucomiss %xmm4, %xmm5 +; SSE-64-NEXT: movl $0, %esi +; SSE-64-NEXT: cmoval %ecx, %esi ; SSE-64-NEXT: movd %edx, %xmm4 -; SSE-64-NEXT: movaps %xmm3, %xmm5 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-64-NEXT: movaps %xmm2, %xmm6 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-64-NEXT: ucomiss %xmm5, %xmm6 -; SSE-64-NEXT: movl $0, %edx -; SSE-64-NEXT: cmoval %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm5 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-64-NEXT: movd %esi, %xmm5 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmoval %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-64-NEXT: cmoval %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ogt_q: @@ -285,17 +287,17 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -324,17 +326,17 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovael %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: ucomiss 
%xmm3, %xmm2 ; SSE-64-NEXT: cmovael %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_oge_q: @@ -414,6 +416,7 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32: # %bb.0: ; SSE-32-NEXT: pushl %ebp ; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 @@ -426,31 +429,32 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-32-NEXT: movaps %xmm3, %xmm6 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 -; SSE-32-NEXT: movl $0, %edx -; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: ucomiss %xmm4, %xmm5 +; SSE-32-NEXT: movl $0, %esi +; SSE-32-NEXT: cmoval %ecx, %esi +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %esi, %xmm4 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: cmoval %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; @@ -465,30 +469,30 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: movl $-1, %ecx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmoval %ecx, %edx +; SSE-64-NEXT: movaps %xmm2, %xmm4 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-64-NEXT: movaps %xmm3, %xmm5 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-64-NEXT: ucomiss %xmm4, %xmm5 +; SSE-64-NEXT: movl $0, %esi +; SSE-64-NEXT: cmoval %ecx, %esi ; SSE-64-NEXT: movd %edx, %xmm4 -; SSE-64-NEXT: movaps %xmm2, %xmm5 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; 
SSE-64-NEXT: movaps %xmm3, %xmm6 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-64-NEXT: ucomiss %xmm5, %xmm6 -; SSE-64-NEXT: movl $0, %edx -; SSE-64-NEXT: cmoval %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm5 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-64-NEXT: movd %esi, %xmm5 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmoval %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-64-NEXT: cmoval %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_olt_q: @@ -591,17 +595,17 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovael %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -630,17 +634,17 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovael %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: cmovael %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ole_q: @@ -1036,17 +1040,17 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: ucomiss %xmm2, 
%xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovbl %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1075,17 +1079,17 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbl %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: cmovbl %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ugt_q: @@ -1163,6 +1167,7 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32: # %bb.0: ; SSE-32-NEXT: pushl %ebp ; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 @@ -1175,31 +1180,32 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: movaps %xmm2, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-32-NEXT: movaps %xmm3, %xmm6 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 -; SSE-32-NEXT: movl $0, %edx -; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movaps %xmm3, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: ucomiss %xmm4, %xmm5 +; SSE-32-NEXT: movl $0, %esi +; SSE-32-NEXT: cmovbel %ecx, %esi +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %esi, %xmm4 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: cmovbel %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; 
SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; @@ -1214,30 +1220,30 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: movl $-1, %ecx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbel %ecx, %edx +; SSE-64-NEXT: movaps %xmm2, %xmm4 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-64-NEXT: movaps %xmm3, %xmm5 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-64-NEXT: ucomiss %xmm4, %xmm5 +; SSE-64-NEXT: movl $0, %esi +; SSE-64-NEXT: cmovbel %ecx, %esi ; SSE-64-NEXT: movd %edx, %xmm4 -; SSE-64-NEXT: movaps %xmm2, %xmm5 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-64-NEXT: movaps %xmm3, %xmm6 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-64-NEXT: ucomiss %xmm5, %xmm6 -; SSE-64-NEXT: movl $0, %edx -; SSE-64-NEXT: cmovbel %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm5 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-64-NEXT: movd %esi, %xmm5 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbel %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm2, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-64-NEXT: cmovbel %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_uge_q: @@ -1340,17 +1346,17 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn 
%xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1379,17 +1385,17 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbl %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: cmovbl %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ult_q: @@ -1469,6 +1475,7 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32: # %bb.0: ; SSE-32-NEXT: pushl %ebp ; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: pushl %esi ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp ; SSE-32-NEXT: movaps 8(%ebp), %xmm3 @@ -1481,31 +1488,32 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: movaps %xmm3, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-32-NEXT: movaps %xmm2, %xmm6 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm4, %xmm6 -; SSE-32-NEXT: movl $0, %edx -; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-32-NEXT: movaps %xmm2, %xmm5 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: ucomiss %xmm4, %xmm5 +; SSE-32-NEXT: movl $0, %esi +; SSE-32-NEXT: cmovbel %ecx, %esi +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movd %esi, %xmm4 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: cmovbel %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-32-NEXT: pand %xmm5, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm5 -; SSE-32-NEXT: por %xmm5, %xmm0 -; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: movd %edx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 +; SSE-32-NEXT: leal -4(%ebp), %esp +; SSE-32-NEXT: popl %esi ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; @@ -1520,30 +1528,30 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, ; SSE-64-NEXT: 
movl $-1, %ecx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbel %ecx, %edx +; SSE-64-NEXT: movaps %xmm3, %xmm4 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-64-NEXT: movaps %xmm2, %xmm5 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-64-NEXT: ucomiss %xmm4, %xmm5 +; SSE-64-NEXT: movl $0, %esi +; SSE-64-NEXT: cmovbel %ecx, %esi ; SSE-64-NEXT: movd %edx, %xmm4 -; SSE-64-NEXT: movaps %xmm3, %xmm5 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-64-NEXT: movaps %xmm2, %xmm6 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-64-NEXT: ucomiss %xmm5, %xmm6 -; SSE-64-NEXT: movl $0, %edx -; SSE-64-NEXT: cmovbel %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm5 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-64-NEXT: movd %esi, %xmm5 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbel %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-64-NEXT: cmovbel %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v4f32_ule_q: @@ -1904,24 +1912,24 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: movd %edx, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1933,16 +1941,16 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovaq %rcx, %rdx -; SSE-64-NEXT: movq %rdx, %xmm4 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: unpckhpd 
{{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: ucomisd %xmm3, %xmm2 ; SSE-64-NEXT: cmovaq %rcx, %rax -; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movq %rdx, %xmm2 +; SSE-64-NEXT: movq %rax, %xmm3 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ogt_q: @@ -2024,24 +2032,24 @@ define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2144,24 +2152,24 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: movd %edx, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2173,16 +2181,16 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x 
double> %f1, ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovaq %rcx, %rdx -; SSE-64-NEXT: movq %rdx, %xmm4 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: ucomisd %xmm2, %xmm3 ; SSE-64-NEXT: cmovaq %rcx, %rax -; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movq %rdx, %xmm2 +; SSE-64-NEXT: movq %rax, %xmm3 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_olt_q: @@ -2262,24 +2270,24 @@ define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovael %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2673,24 +2681,24 @@ define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm4 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovbl %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; 
SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2791,24 +2799,24 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: movd %edx, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2820,16 +2828,16 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbeq %rcx, %rdx -; SSE-64-NEXT: movq %rdx, %xmm4 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: ucomisd %xmm2, %xmm3 ; SSE-64-NEXT: cmovbeq %rcx, %rax -; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movq %rdx, %xmm2 +; SSE-64-NEXT: movq %rax, %xmm3 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_uge_q: @@ -2909,24 +2917,24 @@ define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por 
%xmm3, %xmm0 +; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3029,24 +3037,24 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movapd 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: movd %edx, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3058,16 +3066,16 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbeq %rcx, %rdx -; SSE-64-NEXT: movq %rdx, %xmm4 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: ucomisd %xmm3, %xmm2 ; SSE-64-NEXT: cmovbeq %rcx, %rax -; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movq %rdx, %xmm2 +; SSE-64-NEXT: movq %rax, %xmm3 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ule_q: diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll index 1e56ddc0c8ec8..f90d641ac3480 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll @@ -15,22 +15,22 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movaps 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: comiss %xmm4, %xmm2 -; SSE-32-NEXT: movl $-1, %ecx -; SSE-32-NEXT: movl $0, %edx -; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: movl $-1, %edx +; SSE-32-NEXT: movl $0, 
%ecx +; SSE-32-NEXT: cmoval %edx, %ecx +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-32-NEXT: comiss %xmm4, %xmm2 -; SSE-32-NEXT: cmoval %ecx, %eax -; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: cmoval %edx, %eax +; SSE-32-NEXT: movd %ecx, %xmm2 +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -42,16 +42,16 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-64-NEXT: movl $-1, %ecx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmoval %ecx, %edx -; SSE-64-NEXT: movd %edx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: comiss %xmm3, %xmm2 ; SSE-64-NEXT: cmoval %ecx, %eax -; SSE-64-NEXT: movd %eax, %xmm2 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: movd %edx, %xmm2 +; SSE-64-NEXT: movd %eax, %xmm3 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f32_ogt_s: @@ -100,11 +100,11 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512-32-NEXT: vcomiss 8(%ebp), %xmm2 ; AVX512-32-NEXT: seta %al ; AVX512-32-NEXT: andl $1, %eax -; AVX512-32-NEXT: kmovw %eax, %k0 ; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512-32-NEXT: vcomiss 12(%ebp), %xmm2 -; AVX512-32-NEXT: seta %al -; AVX512-32-NEXT: kmovw %eax, %k1 +; AVX512-32-NEXT: seta %cl +; AVX512-32-NEXT: kmovw %eax, %k0 +; AVX512-32-NEXT: kmovw %ecx, %k1 ; AVX512-32-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-32-NEXT: kshiftrw $14, %k1, %k1 ; AVX512-32-NEXT: korw %k1, %k0, %k1 @@ -118,12 +118,12 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512-64-NEXT: vcomiss %xmm3, %xmm2 ; AVX512-64-NEXT: seta %al ; AVX512-64-NEXT: andl $1, %eax -; AVX512-64-NEXT: kmovw %eax, %k0 ; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] ; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512-64-NEXT: vcomiss %xmm3, %xmm2 -; AVX512-64-NEXT: seta %al -; AVX512-64-NEXT: kmovw %eax, %k1 +; AVX512-64-NEXT: seta %cl +; AVX512-64-NEXT: kmovw %eax, %k0 +; AVX512-64-NEXT: kmovw %ecx, %k1 ; AVX512-64-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-64-NEXT: kshiftrw $14, %k1, %k1 ; AVX512-64-NEXT: korw %k1, %k0, %k1 @@ -141,11 +141,11 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512F-32-NEXT: vcomiss 8(%ebp), %xmm2 ; AVX512F-32-NEXT: seta %al ; AVX512F-32-NEXT: andl $1, %eax -; AVX512F-32-NEXT: kmovw %eax, %k0 ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512F-32-NEXT: vcomiss 12(%ebp), %xmm2 -; AVX512F-32-NEXT: seta %al -; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: seta %cl +; AVX512F-32-NEXT: kmovw %eax, %k0 +; AVX512F-32-NEXT: kmovw %ecx, %k1 ; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1 ; 
AVX512F-32-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-32-NEXT: korw %k1, %k0, %k1 @@ -163,12 +163,12 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512F-64-NEXT: vcomiss %xmm3, %xmm2 ; AVX512F-64-NEXT: seta %al ; AVX512F-64-NEXT: andl $1, %eax -; AVX512F-64-NEXT: kmovw %eax, %k0 ; AVX512F-64-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] ; AVX512F-64-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512F-64-NEXT: vcomiss %xmm3, %xmm2 -; AVX512F-64-NEXT: seta %al -; AVX512F-64-NEXT: kmovw %eax, %k1 +; AVX512F-64-NEXT: seta %cl +; AVX512F-64-NEXT: kmovw %eax, %k0 +; AVX512F-64-NEXT: kmovw %ecx, %k1 ; AVX512F-64-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-64-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-64-NEXT: korw %k1, %k0, %k1 @@ -193,10 +193,10 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-32-NEXT: movaps 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax ; SSE-32-NEXT: ucomiss %xmm4, %xmm2 -; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $-1, %edx ; SSE-32-NEXT: cmovnel %eax, %edx ; SSE-32-NEXT: cmovpl %eax, %edx +; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movd %edx, %xmm3 ; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] @@ -217,16 +217,16 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; SSE-64-NEXT: xorl %eax, %eax ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 ; SSE-64-NEXT: movl $-1, %ecx +; SSE-64-NEXT: cmovnel %eax, %ecx +; SSE-64-NEXT: cmovpl %eax, %ecx ; SSE-64-NEXT: movl $-1, %edx -; SSE-64-NEXT: cmovnel %eax, %edx -; SSE-64-NEXT: cmovpl %eax, %edx -; SSE-64-NEXT: movd %edx, %xmm4 +; SSE-64-NEXT: movd %ecx, %xmm4 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-64-NEXT: ucomiss %xmm3, %xmm2 -; SSE-64-NEXT: cmovnel %eax, %ecx -; SSE-64-NEXT: cmovpl %eax, %ecx -; SSE-64-NEXT: movd %ecx, %xmm2 +; SSE-64-NEXT: cmovnel %eax, %edx +; SSE-64-NEXT: cmovpl %eax, %edx +; SSE-64-NEXT: movd %edx, %xmm2 ; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-64-NEXT: pand %xmm4, %xmm0 ; SSE-64-NEXT: pandn %xmm1, %xmm4 @@ -243,14 +243,14 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX-32-NEXT: xorl %eax, %eax ; AVX-32-NEXT: vucomiss 12(%ebp), %xmm3 ; AVX-32-NEXT: movl $-1, %ecx +; AVX-32-NEXT: cmovnel %eax, %ecx +; AVX-32-NEXT: cmovpl %eax, %ecx ; AVX-32-NEXT: movl $-1, %edx +; AVX-32-NEXT: vucomiss 8(%ebp), %xmm2 ; AVX-32-NEXT: cmovnel %eax, %edx ; AVX-32-NEXT: cmovpl %eax, %edx -; AVX-32-NEXT: vucomiss 8(%ebp), %xmm2 -; AVX-32-NEXT: cmovnel %eax, %ecx -; AVX-32-NEXT: cmovpl %eax, %ecx -; AVX-32-NEXT: vmovd %ecx, %xmm2 -; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 +; AVX-32-NEXT: vmovd %edx, %xmm2 +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp ; AVX-32-NEXT: popl %ebp @@ -286,9 +286,9 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512-32-NEXT: testb %al, %cl ; AVX512-32-NEXT: setne %al ; AVX512-32-NEXT: andl $1, %eax -; AVX512-32-NEXT: kmovw %eax, %k0 ; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512-32-NEXT: vucomiss 12(%ebp), %xmm2 +; AVX512-32-NEXT: kmovw %eax, %k0 ; AVX512-32-NEXT: setnp %al ; AVX512-32-NEXT: sete %cl ; AVX512-32-NEXT: testb %al, %cl @@ -339,9 +339,9 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, ; AVX512F-32-NEXT: testb %al, %cl 
; AVX512F-32-NEXT: setne %al ; AVX512F-32-NEXT: andl $1, %eax -; AVX512F-32-NEXT: kmovw %eax, %k0 ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512F-32-NEXT: vucomiss 12(%ebp), %xmm2 +; AVX512F-32-NEXT: kmovw %eax, %k0 ; AVX512F-32-NEXT: setnp %al ; AVX512F-32-NEXT: sete %cl ; AVX512F-32-NEXT: testb %al, %cl diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index 48a0b27a207f3..4cc10c472744b 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -82,12 +82,11 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttsd2si %xmm0, %rax -; SSE-64-NEXT: movq %rax, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: cvttsd2si %xmm0, %rcx ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: @@ -118,11 +117,11 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax -; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx ; AVX-64-NEXT: vmovq %rax, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: vmovq %rcx, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; ; AVX512F-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: @@ -153,11 +152,11 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512F-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 +; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: @@ -188,11 +187,11 @@ define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512VL-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i64: @@ -441,11 +440,11 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512F-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vshufpd {{.*#+}} 
xmm0 = xmm0[1,0] -; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 +; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: @@ -494,11 +493,11 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { ; AVX512VL-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i64: @@ -560,12 +559,11 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 { ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttss2si %xmm0, %rax -; SSE-64-NEXT: movq %rax, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: cvttss2si %xmm0, %rcx ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: @@ -596,11 +594,11 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 { ; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttss2si %xmm0, %rax -; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx ; AVX-64-NEXT: vmovq %rax, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: vmovq %rcx, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; ; AVX512F-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: @@ -631,11 +629,11 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 { ; AVX512F-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-64-NEXT: vcvttss2si %xmm0, %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 +; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: @@ -666,11 +664,11 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 { ; AVX512VL-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: 
vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i64: @@ -733,12 +731,12 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp { ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; SSE-64: # %bb.0: -; SSE-64-NEXT: movaps (%rdi), %xmm1 -; SSE-64-NEXT: cvttss2si %xmm1, %rax +; SSE-64-NEXT: movaps (%rdi), %xmm0 +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-64-NEXT: cvttss2si %xmm0, %rcx ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-64-NEXT: cvttss2si %xmm1, %rax -; SSE-64-NEXT: movq %rax, %xmm1 +; SSE-64-NEXT: movq %rcx, %xmm1 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; @@ -772,9 +770,9 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp { ; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttss2si 4(%rdi), %rax +; AVX-64-NEXT: vcvttss2si (%rdi), %rcx ; AVX-64-NEXT: vmovq %rax, %xmm0 -; AVX-64-NEXT: vcvttss2si (%rdi), %rax -; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vmovq %rcx, %xmm1 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-64-NEXT: retq ; @@ -808,9 +806,9 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp { ; AVX512F-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttss2si 4(%rdi), %rax +; AVX512F-64-NEXT: vcvttss2si (%rdi), %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vcvttss2si (%rdi), %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-64-NEXT: retq ; @@ -844,9 +842,9 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp { ; AVX512VL-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttss2si 4(%rdi), %rax +; AVX512VL-64-NEXT: vcvttss2si (%rdi), %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: vcvttss2si (%rdi), %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-64-NEXT: retq ; @@ -1112,11 +1110,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 { ; AVX512F-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-64-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 +; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; ; AVX512VL-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64: @@ -1165,11 +1163,11 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 { ; AVX512VL-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: 
vcvttss2usi %xmm0, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i64: @@ -1425,9 +1423,9 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp { ; AVX512F-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vcvttss2usi 4(%rdi), %rax +; AVX512F-64-NEXT: vcvttss2usi (%rdi), %rcx ; AVX512F-64-NEXT: vmovq %rax, %xmm0 -; AVX512F-64-NEXT: vcvttss2usi (%rdi), %rax -; AVX512F-64-NEXT: vmovq %rax, %xmm1 +; AVX512F-64-NEXT: vmovq %rcx, %xmm1 ; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-64-NEXT: retq ; @@ -1479,9 +1477,9 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp { ; AVX512VL-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vcvttss2usi 4(%rdi), %rax +; AVX512VL-64-NEXT: vcvttss2usi (%rdi), %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm0 -; AVX512VL-64-NEXT: vcvttss2usi (%rdi), %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-64-NEXT: retq ; @@ -1597,12 +1595,11 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 { ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttsd2si %xmm0, %rax -; SSE-64-NEXT: movd %eax, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: cvttsd2si %xmm0, %rcx ; SSE-64-NEXT: movd %eax, %xmm0 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movd %ecx, %xmm1 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: @@ -1753,12 +1750,11 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 { ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i32: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttss2si %xmm0, %rax -; SSE-64-NEXT: movd %eax, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: cvttss2si %xmm0, %rcx ; SSE-64-NEXT: movd %eax, %xmm0 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movd %ecx, %xmm1 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32: @@ -2290,12 +2286,11 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 { ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i1: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttsd2si %xmm0, %rax -; SSE-64-NEXT: movq %rax, %xmm1 ; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: cvttsd2si %xmm0, %rcx ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f64_to_v2i1: @@ -2326,11 +2321,11 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX-64-LABEL: 
strict_vector_fptosi_v2f64_to_v2i1: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttsd2si %xmm0, %rax -; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx ; AVX-64-NEXT: vmovq %rax, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: vmovq %rcx, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i1: @@ -2642,12 +2637,11 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 { ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i1: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttss2si %xmm0, %rax -; SSE-64-NEXT: movq %rax, %xmm1 ; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: cvttss2si %xmm0, %rcx ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f32_to_v2i1: @@ -2678,11 +2672,11 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 { ; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i1: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vcvttss2si %xmm0, %rax -; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx ; AVX-64-NEXT: vmovq %rax, %xmm0 -; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: vmovq %rcx, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i1: diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll index 179e8ad69672b..84a5e685e57f7 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1113,7 +1113,7 @@ define <4 x i32> @strict_vector_fptosi_v4f64_to_v4i32(<4 x double> %a) #0 { define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] ; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] @@ -1379,10 +1379,10 @@ define <8 x i32> @strict_vector_fptosi_v8f32_to_v8i32(<8 x float> %a) #0 { define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: 
vblendvps %ymm2, %ymm3, %ymm4, %ymm4 ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index ce5db5b246775..49bb5dff09519 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -74,16 +74,16 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-32-NEXT: movl %ebp, %esp ; AVX512VL-32-NEXT: popl %ebp @@ -98,13 +98,13 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax +; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-64-NEXT: vcvttsd2si %xmm3, %rax +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm3, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 @@ -287,13 +287,13 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax +; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-64-NEXT: vcvttsd2usi %xmm3, %rax +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm3, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax -; 
AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm2 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 @@ -366,16 +366,16 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 ; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-32-NEXT: movl %ebp, %esp ; AVX512VL-32-NEXT: popl %ebp @@ -387,17 +387,17 @@ define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-64-NEXT: vcvttss2si %xmm3, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm3 ; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax @@ -576,17 +576,17 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 -; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-64-NEXT: vcvttss2usi %xmm3, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm3 -; AVX512VL-64-NEXT: vpunpcklqdq 
{{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm3 ; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax -; AVX512VL-64-NEXT: vmovq %rax, %xmm3 ; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rcx ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 -; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vmovq %rcx, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll index a336d0a01fa7b..65172f14bc35f 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -411,9 +411,9 @@ define <8 x float> @uitofp_v8i32_v8f32(<8 x i32> %x) #0 { ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vsubps %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} ; @@ -587,16 +587,16 @@ define <4 x double> @uitofp_v4i32_v4f64(<4 x i32> %x) #0 { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: ret{{[l|q]}} ; ; AVX2-LABEL: uitofp_v4i32_v4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} @@ -673,12 +673,12 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; @@ -689,12 +689,12 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX2-64-NEXT: vmovq %xmm1, %rax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-64-NEXT: retq ; @@ -705,12 +705,12 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm1, %rax ; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq ; @@ -721,12 +721,12 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm1, %rax ; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq ; @@ -805,19 +805,19 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX1-64-NEXT: vmovd %xmm1, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-64-NEXT: vextractps $2, %xmm0, %eax +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 ; AVX1-64-NEXT: vmovq %xmm0, %rax ; AVX1-64-NEXT: movl %eax, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-64-NEXT: 
vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-64-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; AVX1-64-NEXT: vpextrd $1, %xmm1, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 -; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; AVX1-64-NEXT: vpextrd $3, %xmm0, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 ; AVX1-64-NEXT: vpextrd $1, %xmm0, %eax @@ -835,12 +835,12 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX2-64-NEXT: vextractps $1, %xmm1, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-64-NEXT: vextractps $3, %xmm0, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX2-64-NEXT: vextractps $1, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 -; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-64-NEXT: vextractps $1, %xmm0, %eax +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9] ; AVX2-64-NEXT: vmulpd %ymm3, %ymm2, %ymm2 @@ -866,12 +866,12 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm1, %rax ; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 -; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 -; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq ; @@ -882,12 +882,12 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm1, %rax ; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 -; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 -; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq ; @@ -950,13 +950,13 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm0, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-64-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq @@ -967,13 +967,13 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq @@ -984,13 +984,13 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq @@ -1001,13 +1001,13 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-64-NEXT: vzeroupper ; AVX512VL-64-NEXT: retq @@ -1095,13 +1095,13 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm6, %xmm1 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -1120,14 +1120,14 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX2-64-NEXT: vmovq %xmm1, %rax ; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-64-NEXT: vmovq %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX2-64-NEXT: vmovq %xmm1, %rcx +; AVX2-64-NEXT: vcvtsi2ss %rcx, %xmm4, %xmm1 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; AVX2-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 @@ -1141,13 +1141,13 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax ; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq @@ -1158,13 +1158,13 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax ; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-64-NEXT: vzeroupper ; AVX512VL-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll index 0cf945202a2d4..ce4d3850515a6 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -307,12 +307,12 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; NODQ-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; NODQ-32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] +; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-32-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 ; NODQ-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; NODQ-32-NEXT: movl %ebp, %esp ; NODQ-32-NEXT: popl %ebp @@ -326,24 +326,24 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vmovq %xmm2, %rax ; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-64-NEXT: vmovq %xmm3, %rax ; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-64-NEXT: vmovq %xmm3, %rax +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -436,12 +436,12 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; NODQ-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] ; NODQ-32-NEXT: vmovsd 
{{.*#+}} xmm2 = mem[0],zero ; NODQ-32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; NODQ-32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NODQ-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; NODQ-32-NEXT: vmovhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] +; NODQ-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NODQ-32-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 ; NODQ-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; NODQ-32-NEXT: movl %ebp, %esp ; NODQ-32-NEXT: popl %ebp @@ -455,24 +455,24 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vmovq %xmm2, %rax ; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax +; NODQ-64-NEXT: vmovq %xmm3, %rax ; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm4 +; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; NODQ-64-NEXT: vmovq %xmm3, %rax +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm5, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm5, %xmm3 +; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm5, %xmm0 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -551,26 +551,26 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 { ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-64-NEXT: vmovq %xmm3, %rax +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm3[0] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-64-NEXT: vmovq %xmm0, %rcx +; NODQ-64-NEXT: vcvtsi2ss %rcx, %xmm5, %xmm0 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-64-NEXT: retq ; @@ -678,26 +678,26 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 { ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax ; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-64-NEXT: vmovq %xmm3, %rax +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; NODQ-64-NEXT: vpextrq $1, %xmm3, %rax +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 -; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; NODQ-64-NEXT: vmovq %xmm0, %rcx +; NODQ-64-NEXT: vcvtusi2ss %rcx, %xmm5, %xmm0 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index e229165be967a..1ca2ac31389a5 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -172,7 +172,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: vmovdqa (%rdi), %xmm0 ; X64-NEXT: vmovdqa 16(%rdi), %xmm1 -; X64-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; X64-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git 
a/llvm/test/CodeGen/X86/vec_call.ll b/llvm/test/CodeGen/X86/vec_call.ll index cc620d3e5f5fb..680c3799b8292 100644 --- a/llvm/test/CodeGen/X86/vec_call.ll +++ b/llvm/test/CodeGen/X86/vec_call.ll @@ -9,13 +9,11 @@ define void @test() { ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,1,3,9] ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,2,3,4] +; CHECK-NEXT: movups %xmm0, (%esp) ; CHECK-NEXT: movl $7, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $6, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $5, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $4, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $3, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $2, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $1, (%esp) ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4,3,2,1] ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [8,7,6,5] ; CHECK-NEXT: movaps {{.*#+}} xmm2 = [6,4,2,0] diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll index e0089354cc953..157bb8dd4595c 100644 --- a/llvm/test/CodeGen/X86/vec_cast.ll +++ b/llvm/test/CodeGen/X86/vec_cast.ll @@ -95,11 +95,11 @@ define <3 x i32> @e(<3 x i16> %a) nounwind { ; ; CHECK-WIN-LABEL: e: ; CHECK-WIN: # %bb.0: -; CHECK-WIN-NEXT: # kill: def $r8w killed $r8w def $r8d ; CHECK-WIN-NEXT: # kill: def $dx killed $dx def $edx ; CHECK-WIN-NEXT: movzwl %cx, %eax ; CHECK-WIN-NEXT: movd %eax, %xmm0 ; CHECK-WIN-NEXT: pinsrw $2, %edx, %xmm0 +; CHECK-WIN-NEXT: # kill: def $r8w killed $r8w def $r8d ; CHECK-WIN-NEXT: pinsrw $4, %r8d, %xmm0 ; CHECK-WIN-NEXT: retq %c = zext <3 x i16> %a to <3 x i32> @@ -145,9 +145,9 @@ define <8 x i16> @g(<8 x i32> %a) nounwind { define <3 x i16> @h(<3 x i32> %a) nounwind { ; CHECK-LIN-LABEL: h: ; CHECK-LIN: # %bb.0: -; CHECK-LIN-NEXT: movd %xmm0, %eax ; CHECK-LIN-NEXT: pextrw $2, %xmm0, %edx ; CHECK-LIN-NEXT: pextrw $4, %xmm0, %ecx +; CHECK-LIN-NEXT: movd %xmm0, %eax ; CHECK-LIN-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-LIN-NEXT: # kill: def $dx killed $dx killed $edx ; CHECK-LIN-NEXT: # kill: def $cx killed $cx killed $ecx @@ -156,12 +156,12 @@ define <3 x i16> @h(<3 x i32> %a) nounwind { ; CHECK-WIN-LABEL: h: ; CHECK-WIN: # %bb.0: ; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0 -; CHECK-WIN-NEXT: movl (%rcx), %eax ; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx -; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx +; CHECK-WIN-NEXT: pextrw $4, %xmm0, %r8d +; CHECK-WIN-NEXT: movl (%rcx), %eax ; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-WIN-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-WIN-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-WIN-NEXT: movl %r8d, %ecx ; CHECK-WIN-NEXT: retq %c = trunc <3 x i32> %a to <3 x i16> ret <3 x i16> %c diff --git a/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll index ac4b25be5eb65..74d984d575886 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll @@ -155,7 +155,7 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ne_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -194,7 +194,7 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ne_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 
= ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -233,7 +233,7 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ne_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -272,7 +272,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ne_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -305,7 +305,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; SSE41-LABEL: ge_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -349,7 +349,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ge_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -388,7 +388,7 @@ define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ge_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -427,7 +427,7 @@ define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ge_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -466,7 +466,7 @@ define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ge_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -497,7 +497,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; SSE41-LABEL: gt_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -613,7 +613,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; SSE41-LABEL: le_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -657,7 +657,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; 
AVX512-LABEL: le_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -696,7 +696,7 @@ define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: le_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -735,7 +735,7 @@ define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: le_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -774,7 +774,7 @@ define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: le_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -805,7 +805,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; SSE41-LABEL: lt_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll index 9a0756edbce32..731aa173bd20c 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -155,7 +155,7 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ne_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -194,7 +194,7 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ne_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -233,7 +233,7 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ne_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -272,7 +272,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ne_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ 
-343,7 +343,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -518,7 +518,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -535,7 +535,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -594,7 +594,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -653,7 +653,7 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -696,7 +696,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -767,7 +767,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -943,7 +943,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -960,7 +960,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, 
%zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1020,7 +1020,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1080,7 +1080,7 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1123,7 +1123,7 @@ define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_compare-sse4.ll b/llvm/test/CodeGen/X86/vec_compare-sse4.ll index dde307aae26a7..bd9ac20e8eeb6 100644 --- a/llvm/test/CodeGen/X86/vec_compare-sse4.ll +++ b/llvm/test/CodeGen/X86/vec_compare-sse4.ll @@ -21,7 +21,7 @@ define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind { ; ; SSE41-LABEL: test1: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll index cd375c0416881..019adc000025c 100644 --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -34,19 +34,25 @@ entry: define i32 @test1(ptr nocapture readonly %ptr) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: # %entry +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movd (%eax), %mm0 -; X86-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3] -; X86-NEXT: movd %mm0, %eax +; X86-NEXT: movq %mm0, (%esp) # 8-byte Spill ; X86-NEXT: emms +; X86-NEXT: pshufw $232, (%esp), %mm0 # 8-byte Folded Reload +; X86-NEXT: # mm0 = mem[0,2,2,3] +; X86-NEXT: movd %mm0, %eax +; X86-NEXT: addl $8, %esp ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry ; X64-NEXT: movd (%rdi), %mm0 -; X64-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3] -; X64-NEXT: movd %mm0, %eax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: pshufw $232, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload +; X64-NEXT: # mm0 = mem[0,2,2,3] +; X64-NEXT: movd %mm0, %eax ; X64-NEXT: retq entry: %0 = load i32, ptr %ptr, align 4 @@ -69,17 +75,23 @@ entry: define i32 @test2(ptr nocapture readonly %ptr) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: # %entry +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3] -; X86-NEXT: movd %mm0, %eax +; X86-NEXT: movq %mm0, (%esp) # 8-byte Spill ; X86-NEXT: emms +; X86-NEXT: movq (%esp), 
%mm0 # 8-byte Reload +; X86-NEXT: movd %mm0, %eax +; X86-NEXT: addl $8, %esp ; X86-NEXT: retl ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry ; X64-NEXT: pshufw $232, (%rdi), %mm0 # mm0 = mem[0,2,2,3] -; X64-NEXT: movd %mm0, %eax +; X64-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: emms +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload +; X64-NEXT: movd %mm0, %eax ; X64-NEXT: retq entry: %0 = load <1 x i64>, ptr %ptr, align 8 diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index d0abd7d5f7512..836e723d0c966 100644 --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -155,15 +155,15 @@ define <8 x half> @fabs_v8f16(ptr %p) nounwind { ; ; X86-AVX2-LABEL: fabs_v8f16: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpand (%eax), %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fabs_v8f16: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpand (%eax), %xmm0, %xmm0 ; X86-AVX512-NEXT: retl ; @@ -361,15 +361,15 @@ define <16 x half> @fabs_v16f16(ptr %p) nounwind { ; ; X86-AVX2-LABEL: fabs_v16f16: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpand (%eax), %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fabs_v16f16: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpand (%eax), %ymm0, %ymm0 ; X86-AVX512-NEXT: retl ; @@ -424,12 +424,19 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) nounwind { ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX1OR2-LABEL: fabs_v8f64: -; X86-AVX1OR2: # %bb.0: -; X86-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX1OR2-NEXT: retl +; X86-AVX1-LABEL: fabs_v8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fabs_v8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512VL-LABEL: fabs_v8f64: ; X86-AVX512VL: # %bb.0: @@ -455,12 +462,19 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) nounwind { ; X64-SSE-NEXT: andps %xmm4, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX1OR2-LABEL: fabs_v8f64: -; X64-AVX1OR2: # %bb.0: -; X64-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: fabs_v8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandps 
%ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fabs_v8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: fabs_v8f64: ; X64-AVX512VL: # %bb.0: @@ -497,12 +511,19 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) nounwind { ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX1OR2-LABEL: fabs_v16f32: -; X86-AVX1OR2: # %bb.0: -; X86-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX1OR2-NEXT: retl +; X86-AVX1-LABEL: fabs_v16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fabs_v16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512VL-LABEL: fabs_v16f32: ; X86-AVX512VL: # %bb.0: @@ -528,12 +549,19 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) nounwind { ; X64-SSE-NEXT: andps %xmm4, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX1OR2-LABEL: fabs_v16f32: -; X64-AVX1OR2: # %bb.0: -; X64-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: fabs_v16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fabs_v16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: fabs_v16f32: ; X64-AVX512VL: # %bb.0: @@ -571,7 +599,7 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind { ; X86-AVX1-LABEL: fabs_v32f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X86-AVX1-NEXT: vandps (%eax), %ymm1, %ymm0 ; X86-AVX1-NEXT: vandps 32(%eax), %ymm1, %ymm1 ; X86-AVX1-NEXT: retl @@ -594,8 +622,8 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind { ; ; X86-AVX512FP16-LABEL: fabs_v32f16: ; X86-AVX512FP16: # %bb.0: -; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512FP16-NEXT: vpandq (%eax), %zmm0, %zmm0 ; X86-AVX512FP16-NEXT: retl ; @@ -621,7 +649,7 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind { ; ; X64-AVX1-LABEL: fabs_v32f16: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = 
[NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X64-AVX1-NEXT: vandps (%rdi), %ymm1, %ymm0 ; X64-AVX1-NEXT: vandps 32(%rdi), %ymm1, %ymm1 ; X64-AVX1-NEXT: retq @@ -724,17 +752,41 @@ define void @PR70947(ptr %src, ptr %dst) nounwind { ; X86-SSE-NEXT: movups %xmm1, 16(%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: PR70947: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] -; X86-AVX-NEXT: vandps (%ecx), %ymm0, %ymm1 -; X86-AVX-NEXT: vandps 32(%ecx), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovups %ymm1, (%eax) -; X86-AVX-NEXT: vmovups %xmm0, 16(%eax) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: PR70947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandps (%ecx), %ymm0, %ymm1 +; X86-AVX1-NEXT: vandps 32(%ecx), %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovups %ymm1, (%eax) +; X86-AVX1-NEXT: vmovups %xmm0, 16(%eax) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR70947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vandps (%ecx), %ymm0, %ymm1 +; X86-AVX2-NEXT: vandps 32(%ecx), %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovups %ymm1, (%eax) +; X86-AVX2-NEXT: vmovups %xmm0, 16(%eax) +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: PR70947: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X86-AVX512-NEXT: vandps (%ecx), %ymm0, %ymm1 +; X86-AVX512-NEXT: vandps 32(%ecx), %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovups %ymm1, (%eax) +; X86-AVX512-NEXT: vmovups %xmm0, 16(%eax) +; X86-AVX512-NEXT: vzeroupper +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: PR70947: ; X64-SSE: # %bb.0: @@ -747,15 +799,35 @@ define void @PR70947(ptr %src, ptr %dst) nounwind { ; X64-SSE-NEXT: movups %xmm1, 16(%rsi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: PR70947: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] -; X64-AVX-NEXT: vandps (%rdi), %ymm0, %ymm1 -; X64-AVX-NEXT: vandps 32(%rdi), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovups %ymm1, (%rsi) -; X64-AVX-NEXT: vmovups %xmm0, 16(%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: PR70947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandps (%rdi), %ymm0, %ymm1 +; X64-AVX1-NEXT: vandps 32(%rdi), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovups %ymm1, (%rsi) +; X64-AVX1-NEXT: vmovups %xmm0, 16(%rsi) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR70947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vandps (%rdi), %ymm0, %ymm1 +; X64-AVX2-NEXT: vandps 32(%rdi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovups %ymm1, (%rsi) +; X64-AVX2-NEXT: vmovups %xmm0, 16(%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: PR70947: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X64-AVX512-NEXT: vandps (%rdi), %ymm0, %ymm1 +; X64-AVX512-NEXT: vandps 32(%rdi), %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovups %ymm1, (%rsi) +; X64-AVX512-NEXT: 
vmovups %xmm0, 16(%rsi) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %src4 = getelementptr inbounds double, ptr %src, i64 4 %dst4 = getelementptr inbounds i32, ptr %dst, i64 4 %ld0 = load <4 x double>, ptr %src, align 8 @@ -766,3 +838,6 @@ define void @PR70947(ptr %src, ptr %dst) nounwind { store <2 x double> %fabs4, ptr %dst4, align 4 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; X64-AVX: {{.*}} +; X86-AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_fcopysign.ll b/llvm/test/CodeGen/X86/vec_fcopysign.ll index 5b9cda58bac20..7a792dda82e18 100644 --- a/llvm/test/CodeGen/X86/vec_fcopysign.ll +++ b/llvm/test/CodeGen/X86/vec_fcopysign.ll @@ -33,7 +33,7 @@ define <2 x double> @fcopysign_v2f64(<2 x double> %a0, <2 x double> %a1) nounwin ; ; X86-AVX512-LABEL: fcopysign_v2f64: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm1, %xmm0 +; X86-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v2f64: @@ -52,7 +52,7 @@ define <2 x double> @fcopysign_v2f64(<2 x double> %a0, <2 x double> %a1) nounwin ; ; X64-AVX512-LABEL: fcopysign_v2f64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; X64-AVX512-NEXT: retq %t = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a0, <2 x double> %a1) ret <2 x double> %t @@ -77,15 +77,15 @@ define <4 x float> @fcopysign_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind { ; X86-AVX2-LABEL: fcopysign_v4f32: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; X86-AVX2-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X86-AVX2-NEXT: vandps %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vandps %xmm3, %xmm0, %xmm0 ; X86-AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v4f32: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v4f32: @@ -105,15 +105,15 @@ define <4 x float> @fcopysign_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind { ; X64-AVX2-LABEL: fcopysign_v4f32: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] ; X64-AVX2-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X64-AVX2-NEXT: vandps %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vandps %xmm3, %xmm0, %xmm0 ; X64-AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v4f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1)) ; X64-AVX512-NEXT: retq %t = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %t @@ -158,9 +158,9 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm1 
; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm1 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = mem ^ (xmm0 & (xmm1 ^ mem)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f16: @@ -192,9 +192,9 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; ; X64-AVX512-LABEL: fcopysign_v8f16: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = mem ^ (xmm0 & (xmm1 ^ mem)) ; X64-AVX512-NEXT: retq %a0 = load <8 x half>, ptr %p0, align 16 %a1 = load <8 x half>, ptr %p1, align 16 @@ -218,9 +218,9 @@ define <4 x double> @fcopysign_v4f64(<4 x double> %a0, <4 x double> %a1) nounwin ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps %xmm2, %xmm4 ; X86-SSE-NEXT: andps %xmm3, %xmm0 -; X86-SSE-NEXT: orps %xmm4, %xmm0 ; X86-SSE-NEXT: andps %xmm3, %xmm1 ; X86-SSE-NEXT: andnps 8(%ebp), %xmm3 +; X86-SSE-NEXT: orps %xmm4, %xmm0 ; X86-SSE-NEXT: orps %xmm3, %xmm1 ; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp @@ -236,15 +236,15 @@ define <4 x double> @fcopysign_v4f64(<4 x double> %a0, <4 x double> %a1) nounwin ; X86-AVX2-LABEL: fcopysign_v4f64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN] ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X86-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v4f64: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0 +; X86-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v4f64: @@ -269,15 +269,15 @@ define <4 x double> @fcopysign_v4f64(<4 x double> %a0, <4 x double> %a1) nounwin ; X64-AVX2-LABEL: fcopysign_v4f64: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN] ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X64-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v4f64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) ; X64-AVX512-NEXT: retq %t = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a0, <4 x double> %a1) ret <4 x double> %t @@ -295,9 +295,9 @@ define <8 x float> @fcopysign_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind { ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps %xmm2, %xmm4 ; X86-SSE-NEXT: andps %xmm3, %xmm0 -; X86-SSE-NEXT: orps %xmm4, %xmm0 ; X86-SSE-NEXT: andps %xmm3, %xmm1 ; X86-SSE-NEXT: andnps 8(%ebp), %xmm3 +; X86-SSE-NEXT: orps %xmm4, %xmm0 ; X86-SSE-NEXT: orps %xmm3, %xmm1 ; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp @@ -313,15 
+313,15 @@ define <8 x float> @fcopysign_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind { ; X86-AVX2-LABEL: fcopysign_v8f32: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v8f32: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm0 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f32: @@ -346,15 +346,15 @@ define <8 x float> @fcopysign_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind { ; X64-AVX2-LABEL: fcopysign_v8f32: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v8f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1)) ; X64-AVX512-NEXT: retq %t = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a0, <8 x float> %a1) ret <8 x float> %t @@ -365,17 +365,17 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X86-SSE-LABEL: fcopysign_v16f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X86-SSE-NEXT: movaps %xmm1, %xmm2 -; X86-SSE-NEXT: andnps (%ecx), %xmm2 -; X86-SSE-NEXT: movaps (%eax), %xmm0 +; X86-SSE-NEXT: andnps (%eax), %xmm2 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movaps (%ecx), %xmm0 ; X86-SSE-NEXT: andps %xmm1, %xmm0 +; X86-SSE-NEXT: movaps %xmm1, %xmm3 +; X86-SSE-NEXT: andnps 16(%eax), %xmm3 +; X86-SSE-NEXT: andps 16(%ecx), %xmm1 ; X86-SSE-NEXT: orps %xmm2, %xmm0 -; X86-SSE-NEXT: movaps %xmm1, %xmm2 -; X86-SSE-NEXT: andnps 16(%ecx), %xmm2 -; X86-SSE-NEXT: andps 16(%eax), %xmm1 -; X86-SSE-NEXT: orps %xmm2, %xmm1 +; X86-SSE-NEXT: orps %xmm3, %xmm1 ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: fcopysign_v16f16: @@ -404,9 +404,9 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqu (%ecx), %ymm1 ; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 +; X86-AVX512-NEXT: vmovdqu (%ecx), %ymm1 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm0 & (ymm1 ^ mem)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v16f16: @@ -416,11 +416,11 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { 
; X64-SSE-NEXT: andnps (%rsi), %xmm2 ; X64-SSE-NEXT: movaps (%rdi), %xmm0 ; X64-SSE-NEXT: andps %xmm1, %xmm0 -; X64-SSE-NEXT: orps %xmm2, %xmm0 -; X64-SSE-NEXT: movaps %xmm1, %xmm2 -; X64-SSE-NEXT: andnps 16(%rsi), %xmm2 +; X64-SSE-NEXT: movaps %xmm1, %xmm3 +; X64-SSE-NEXT: andnps 16(%rsi), %xmm3 ; X64-SSE-NEXT: andps 16(%rdi), %xmm1 -; X64-SSE-NEXT: orps %xmm2, %xmm1 +; X64-SSE-NEXT: orps %xmm2, %xmm0 +; X64-SSE-NEXT: orps %xmm3, %xmm1 ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: fcopysign_v16f16: @@ -443,9 +443,9 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; ; X64-AVX512-LABEL: fcopysign_v16f16: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 ; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 +; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm0 & (ymm1 ^ mem)) ; X64-AVX512-NEXT: retq %a0 = load <16 x half>, ptr %p0, align 16 %a1 = load <16 x half>, ptr %p1, align 16 @@ -474,38 +474,55 @@ define <8 x double> @fcopysign_v8f64(<8 x double> %a0, <8 x double> %a1) nounwin ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps 40(%ebp), %xmm4 ; X86-SSE-NEXT: orps %xmm4, %xmm1 -; X86-SSE-NEXT: andps %xmm3, %xmm2 ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps 56(%ebp), %xmm4 +; X86-SSE-NEXT: andps %xmm3, %xmm2 +; X86-SSE-NEXT: movaps %xmm3, %xmm5 +; X86-SSE-NEXT: andnps 72(%ebp), %xmm5 ; X86-SSE-NEXT: orps %xmm4, %xmm2 -; X86-SSE-NEXT: movaps %xmm3, %xmm4 -; X86-SSE-NEXT: andnps 72(%ebp), %xmm4 ; X86-SSE-NEXT: andps 8(%ebp), %xmm3 -; X86-SSE-NEXT: orps %xmm4, %xmm3 +; X86-SSE-NEXT: orps %xmm5, %xmm3 ; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX1OR2-LABEL: fcopysign_v8f64: -; X86-AVX1OR2: # %bb.0: -; X86-AVX1OR2-NEXT: pushl %ebp -; X86-AVX1OR2-NEXT: movl %esp, %ebp -; X86-AVX1OR2-NEXT: andl $-32, %esp -; X86-AVX1OR2-NEXT: subl $32, %esp -; X86-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN] -; X86-AVX1OR2-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; X86-AVX1OR2-NEXT: vandps %ymm3, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vorps %ymm2, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vandps %ymm3, %ymm1, %ymm1 -; X86-AVX1OR2-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 -; X86-AVX1OR2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX1OR2-NEXT: movl %ebp, %esp -; X86-AVX1OR2-NEXT: popl %ebp -; X86-AVX1OR2-NEXT: retl +; X86-AVX1-LABEL: fcopysign_v8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fcopysign_v8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 +; 
X86-AVX2-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v8f64: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm0 +; X86-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f64: @@ -528,20 +545,31 @@ define <8 x double> @fcopysign_v8f64(<8 x double> %a0, <8 x double> %a1) nounwin ; X64-SSE-NEXT: orps %xmm8, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX1OR2-LABEL: fcopysign_v8f64: -; X64-AVX1OR2: # %bb.0: -; X64-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN] -; X64-AVX1OR2-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; X64-AVX1OR2-NEXT: vandps %ymm4, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vorps %ymm2, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vandnps %ymm3, %ymm4, %ymm2 -; X64-AVX1OR2-NEXT: vandps %ymm4, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: fcopysign_v8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fcopysign_v8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; X64-AVX2-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandnps %ymm3, %ymm4, %ymm2 +; X64-AVX2-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v8f64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 +; X64-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) ; X64-AVX512-NEXT: retq %t = call <8 x double> @llvm.copysign.v8f64(<8 x double> %a0, <8 x double> %a1) ret <8 x double> %t @@ -564,38 +592,55 @@ define <16 x float> @fcopysign_v16f32(<16 x float> %a0, <16 x float> %a1) nounwi ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps 40(%ebp), %xmm4 ; X86-SSE-NEXT: orps %xmm4, %xmm1 -; X86-SSE-NEXT: andps %xmm3, %xmm2 ; X86-SSE-NEXT: movaps %xmm3, %xmm4 ; X86-SSE-NEXT: andnps 56(%ebp), %xmm4 +; X86-SSE-NEXT: andps %xmm3, %xmm2 +; X86-SSE-NEXT: movaps %xmm3, %xmm5 +; X86-SSE-NEXT: andnps 72(%ebp), %xmm5 ; X86-SSE-NEXT: orps %xmm4, %xmm2 -; X86-SSE-NEXT: movaps %xmm3, %xmm4 -; X86-SSE-NEXT: andnps 72(%ebp), %xmm4 ; X86-SSE-NEXT: andps 8(%ebp), %xmm3 -; X86-SSE-NEXT: orps %xmm4, %xmm3 +; X86-SSE-NEXT: orps %xmm5, %xmm3 ; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX1OR2-LABEL: fcopysign_v16f32: -; X86-AVX1OR2: # %bb.0: -; X86-AVX1OR2-NEXT: pushl %ebp -; X86-AVX1OR2-NEXT: movl %esp, %ebp -; X86-AVX1OR2-NEXT: andl $-32, %esp -; X86-AVX1OR2-NEXT: subl $32, %esp -; X86-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX1OR2-NEXT: vandnps %ymm2, %ymm3, %ymm2 -; X86-AVX1OR2-NEXT: vandps %ymm3, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vorps %ymm2, %ymm0, %ymm0 -; X86-AVX1OR2-NEXT: vandps %ymm3, %ymm1, %ymm1 -; X86-AVX1OR2-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 -; X86-AVX1OR2-NEXT: vorps %ymm2, %ymm1, %ymm1 
-; X86-AVX1OR2-NEXT: movl %ebp, %esp -; X86-AVX1OR2-NEXT: popl %ebp -; X86-AVX1OR2-NEXT: retl +; X86-AVX1-LABEL: fcopysign_v16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fcopysign_v16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %ebp +; X86-AVX2-NEXT: movl %esp, %ebp +; X86-AVX2-NEXT: andl $-32, %esp +; X86-AVX2-NEXT: subl $32, %esp +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vandps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandnps 8(%ebp), %ymm3, %ymm2 +; X86-AVX2-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: movl %ebp, %esp +; X86-AVX2-NEXT: popl %ebp +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v16f32: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm1, %zmm0 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v16f32: @@ -618,20 +663,31 @@ define <16 x float> @fcopysign_v16f32(<16 x float> %a0, <16 x float> %a1) nounwi ; X64-SSE-NEXT: orps %xmm8, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX1OR2-LABEL: fcopysign_v16f32: -; X64-AVX1OR2: # %bb.0: -; X64-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX1OR2-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; X64-AVX1OR2-NEXT: vandps %ymm4, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vorps %ymm2, %ymm0, %ymm0 -; X64-AVX1OR2-NEXT: vandnps %ymm3, %ymm4, %ymm2 -; X64-AVX1OR2-NEXT: vandps %ymm4, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: fcopysign_v16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fcopysign_v16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; X64-AVX2-NEXT: vandps %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vorps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandnps %ymm3, %ymm4, %ymm2 +; X64-AVX2-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v16f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) ; X64-AVX512-NEXT: retq %t = call <16 x float> @llvm.copysign.v16f32(<16 x float> %a0, <16 x float> %a1) ret <16 x float> %t @@ -648,9 +704,9 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { 
; X86-SSE-NEXT: andnps (%ecx), %xmm1 ; X86-SSE-NEXT: movaps (%eax), %xmm0 ; X86-SSE-NEXT: andps %xmm3, %xmm0 -; X86-SSE-NEXT: orps %xmm1, %xmm0 ; X86-SSE-NEXT: movaps %xmm3, %xmm2 ; X86-SSE-NEXT: andnps 16(%ecx), %xmm2 +; X86-SSE-NEXT: orps %xmm1, %xmm0 ; X86-SSE-NEXT: movaps 16(%eax), %xmm1 ; X86-SSE-NEXT: andps %xmm3, %xmm1 ; X86-SSE-NEXT: orps %xmm2, %xmm1 @@ -658,46 +714,46 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: andnps 32(%ecx), %xmm4 ; X86-SSE-NEXT: movaps 32(%eax), %xmm2 ; X86-SSE-NEXT: andps %xmm3, %xmm2 +; X86-SSE-NEXT: movaps %xmm3, %xmm5 +; X86-SSE-NEXT: andnps 48(%ecx), %xmm5 ; X86-SSE-NEXT: orps %xmm4, %xmm2 -; X86-SSE-NEXT: movaps %xmm3, %xmm4 -; X86-SSE-NEXT: andnps 48(%ecx), %xmm4 ; X86-SSE-NEXT: andps 48(%eax), %xmm3 -; X86-SSE-NEXT: orps %xmm4, %xmm3 +; X86-SSE-NEXT: orps %xmm5, %xmm3 ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: fcopysign_v32f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX1-NEXT: vandnps (%ecx), %ymm1, %ymm0 -; X86-AVX1-NEXT: vandps (%eax), %ymm1, %ymm2 -; X86-AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X86-AVX1-NEXT: vandnps 32(%ecx), %ymm1, %ymm2 -; X86-AVX1-NEXT: vandps 32(%eax), %ymm1, %ymm1 -; X86-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX1-NEXT: vandnps (%ecx), %ymm0, %ymm1 +; X86-AVX1-NEXT: vandps (%eax), %ymm0, %ymm2 +; X86-AVX1-NEXT: vandnps 32(%ecx), %ymm0, %ymm3 +; X86-AVX1-NEXT: vandps 32(%eax), %ymm0, %ymm4 +; X86-AVX1-NEXT: vorps %ymm1, %ymm2, %ymm0 +; X86-AVX1-NEXT: vorps %ymm3, %ymm4, %ymm1 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: fcopysign_v32f16: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX2-NEXT: vpandn (%ecx), %ymm1, %ymm0 -; X86-AVX2-NEXT: vpand (%eax), %ymm1, %ymm2 -; X86-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpandn 32(%ecx), %ymm1, %ymm2 -; X86-AVX2-NEXT: vpand 32(%eax), %ymm1, %ymm1 -; X86-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX2-NEXT: vpandn (%ecx), %ymm0, %ymm1 +; X86-AVX2-NEXT: vpand (%eax), %ymm0, %ymm2 +; X86-AVX2-NEXT: vpandn 32(%ecx), %ymm0, %ymm3 +; X86-AVX2-NEXT: vpand 32(%eax), %ymm0, %ymm4 +; X86-AVX2-NEXT: vpor %ymm1, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpor %ymm3, %ymm4, %ymm1 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fcopysign_v32f16: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1 ; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 +; X86-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem)) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v32f16: @@ -707,9 +763,9 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: andnps (%rsi), 
%xmm1 ; X64-SSE-NEXT: movaps (%rdi), %xmm0 ; X64-SSE-NEXT: andps %xmm3, %xmm0 -; X64-SSE-NEXT: orps %xmm1, %xmm0 ; X64-SSE-NEXT: movaps %xmm3, %xmm2 ; X64-SSE-NEXT: andnps 16(%rsi), %xmm2 +; X64-SSE-NEXT: orps %xmm1, %xmm0 ; X64-SSE-NEXT: movaps 16(%rdi), %xmm1 ; X64-SSE-NEXT: andps %xmm3, %xmm1 ; X64-SSE-NEXT: orps %xmm2, %xmm1 @@ -717,40 +773,40 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: andnps 32(%rsi), %xmm4 ; X64-SSE-NEXT: movaps 32(%rdi), %xmm2 ; X64-SSE-NEXT: andps %xmm3, %xmm2 +; X64-SSE-NEXT: movaps %xmm3, %xmm5 +; X64-SSE-NEXT: andnps 48(%rsi), %xmm5 ; X64-SSE-NEXT: orps %xmm4, %xmm2 -; X64-SSE-NEXT: movaps %xmm3, %xmm4 -; X64-SSE-NEXT: andnps 48(%rsi), %xmm4 ; X64-SSE-NEXT: andps 48(%rdi), %xmm3 -; X64-SSE-NEXT: orps %xmm4, %xmm3 +; X64-SSE-NEXT: orps %xmm5, %xmm3 ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: fcopysign_v32f16: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX1-NEXT: vandnps (%rsi), %ymm1, %ymm0 -; X64-AVX1-NEXT: vandps (%rdi), %ymm1, %ymm2 -; X64-AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X64-AVX1-NEXT: vandnps 32(%rsi), %ymm1, %ymm2 -; X64-AVX1-NEXT: vandps 32(%rdi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1 +; X64-AVX1-NEXT: vandps (%rdi), %ymm0, %ymm2 +; X64-AVX1-NEXT: vandnps 32(%rsi), %ymm0, %ymm3 +; X64-AVX1-NEXT: vandps 32(%rdi), %ymm0, %ymm4 +; X64-AVX1-NEXT: vorps %ymm1, %ymm2, %ymm0 +; X64-AVX1-NEXT: vorps %ymm3, %ymm4, %ymm1 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: fcopysign_v32f16: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX2-NEXT: vpandn (%rsi), %ymm1, %ymm0 -; X64-AVX2-NEXT: vpand (%rdi), %ymm1, %ymm2 -; X64-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vpandn 32(%rsi), %ymm1, %ymm2 -; X64-AVX2-NEXT: vpand 32(%rdi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vpandn (%rsi), %ymm0, %ymm1 +; X64-AVX2-NEXT: vpand (%rdi), %ymm0, %ymm2 +; X64-AVX2-NEXT: vpandn 32(%rsi), %ymm0, %ymm3 +; X64-AVX2-NEXT: vpand 32(%rdi), %ymm0, %ymm4 +; X64-AVX2-NEXT: vpor %ymm1, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpor %ymm3, %ymm4, %ymm1 ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: fcopysign_v32f16: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 ; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 +; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem)) ; X64-AVX512-NEXT: retq %a0 = load <32 x half>, ptr %p0, align 16 %a1 = load <32 x half>, ptr %p1, align 16 diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index abb85ac83464c..994a31448b60b 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -1188,18 +1188,18 @@ define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwin ; SSE41-LABEL: floor_mask_512_ps: ; SSE41: ## %bb.0: ; 
SSE41-NEXT: roundps $9, %xmm3, %xmm8 -; SSE41-NEXT: cmpeqps %xmm7, %xmm3 ; SSE41-NEXT: roundps $9, %xmm2, %xmm9 -; SSE41-NEXT: cmpeqps %xmm6, %xmm2 ; SSE41-NEXT: roundps $9, %xmm1, %xmm10 -; SSE41-NEXT: cmpeqps %xmm5, %xmm1 ; SSE41-NEXT: roundps $9, %xmm0, %xmm11 ; SSE41-NEXT: cmpeqps %xmm4, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: cmpeqps %xmm5, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: cmpeqps %xmm6, %xmm2 ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: cmpeqps %xmm7, %xmm3 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: movaps %xmm4, %xmm0 @@ -1272,18 +1272,18 @@ define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwin ; SSE41-LABEL: floor_mask_512_pd: ; SSE41: ## %bb.0: ; SSE41-NEXT: roundpd $9, %xmm3, %xmm8 -; SSE41-NEXT: cmpeqpd %xmm7, %xmm3 ; SSE41-NEXT: roundpd $9, %xmm2, %xmm9 -; SSE41-NEXT: cmpeqpd %xmm6, %xmm2 ; SSE41-NEXT: roundpd $9, %xmm1, %xmm10 -; SSE41-NEXT: cmpeqpd %xmm5, %xmm1 ; SSE41-NEXT: roundpd $9, %xmm0, %xmm11 ; SSE41-NEXT: cmpeqpd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: cmpeqpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: cmpeqpd %xmm6, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: cmpeqpd %xmm7, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: movapd %xmm4, %xmm0 @@ -1395,8 +1395,8 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind { ; SSE41-LABEL: floor_maskz_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: je LBB53_2 ; SSE41-NEXT: ## %bb.1: ; SSE41-NEXT: xorps %xmm2, %xmm2 @@ -1408,8 +1408,8 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi ; ; AVX-LABEL: floor_maskz_ss: ; AVX: ## %bb.0: -; AVX-NEXT: testb $1, %dil ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: testb $1, %dil ; AVX-NEXT: je LBB53_2 ; AVX-NEXT: ## %bb.1: ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2 @@ -1475,8 +1475,8 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind { ; SSE41-LABEL: floor_maskz_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: xorpd %xmm2, %xmm2 +; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: je LBB55_2 ; SSE41-NEXT: ## %bb.1: ; SSE41-NEXT: xorps %xmm2, %xmm2 @@ -1488,8 +1488,8 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou ; ; AVX-LABEL: floor_maskz_sd: ; AVX: ## %bb.0: -; AVX-NEXT: testb $1, %dil ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: testb $1, %dil ; AVX-NEXT: je LBB55_2 ; AVX-NEXT: ## %bb.1: ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2 @@ -2178,18 +2178,18 @@ define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind ; SSE41-LABEL: ceil_mask_512_ps: ; SSE41: ## %bb.0: ; SSE41-NEXT: roundps $10, %xmm3, %xmm8 -; SSE41-NEXT: cmpeqps %xmm7, %xmm3 ; SSE41-NEXT: roundps $10, %xmm2, %xmm9 -; SSE41-NEXT: cmpeqps %xmm6, %xmm2 ; SSE41-NEXT: roundps $10, %xmm1, %xmm10 -; SSE41-NEXT: cmpeqps %xmm5, %xmm1 ; SSE41-NEXT: roundps $10, %xmm0, %xmm11 ; SSE41-NEXT: cmpeqps %xmm4, 
%xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: cmpeqps %xmm5, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: cmpeqps %xmm6, %xmm2 ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: cmpeqps %xmm7, %xmm3 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: movaps %xmm4, %xmm0 @@ -2262,18 +2262,18 @@ define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind ; SSE41-LABEL: ceil_mask_512_pd: ; SSE41: ## %bb.0: ; SSE41-NEXT: roundpd $10, %xmm3, %xmm8 -; SSE41-NEXT: cmpeqpd %xmm7, %xmm3 ; SSE41-NEXT: roundpd $10, %xmm2, %xmm9 -; SSE41-NEXT: cmpeqpd %xmm6, %xmm2 ; SSE41-NEXT: roundpd $10, %xmm1, %xmm10 -; SSE41-NEXT: cmpeqpd %xmm5, %xmm1 ; SSE41-NEXT: roundpd $10, %xmm0, %xmm11 ; SSE41-NEXT: cmpeqpd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: cmpeqpd %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: cmpeqpd %xmm6, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: cmpeqpd %xmm7, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 ; SSE41-NEXT: movapd %xmm4, %xmm0 @@ -2385,8 +2385,8 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind { ; SSE41-LABEL: ceil_maskz_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: je LBB79_2 ; SSE41-NEXT: ## %bb.1: ; SSE41-NEXT: xorps %xmm2, %xmm2 @@ -2398,8 +2398,8 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin ; ; AVX-LABEL: ceil_maskz_ss: ; AVX: ## %bb.0: -; AVX-NEXT: testb $1, %dil ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: testb $1, %dil ; AVX-NEXT: je LBB79_2 ; AVX-NEXT: ## %bb.1: ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 @@ -2465,8 +2465,8 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind { ; SSE41-LABEL: ceil_maskz_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: xorpd %xmm2, %xmm2 +; SSE41-NEXT: testb $1, %dil ; SSE41-NEXT: je LBB81_2 ; SSE41-NEXT: ## %bb.1: ; SSE41-NEXT: xorps %xmm2, %xmm2 @@ -2478,8 +2478,8 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun ; ; AVX-LABEL: ceil_maskz_sd: ; AVX: ## %bb.0: -; AVX-NEXT: testb $1, %dil ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: testb $1, %dil ; AVX-NEXT: je LBB81_2 ; AVX-NEXT: ## %bb.1: ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll index 64204a5c2123f..680b9b3716698 100644 --- a/llvm/test/CodeGen/X86/vec_fneg.ll +++ b/llvm/test/CodeGen/X86/vec_fneg.ll @@ -153,15 +153,15 @@ define <8 x half> @fneg_v8f16(ptr %p) nounwind { ; ; X86-AVX2-LABEL: fneg_v8f16: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpxor (%eax), %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fneg_v8f16: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpxor (%eax), %xmm0, %xmm0 ; X86-AVX512-NEXT: retl ; @@ -356,15 +356,15 @@ define <16 x half> @fneg_v16f16(ptr %p) nounwind { ; ; X86-AVX2-LABEL: fneg_v16f16: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: vpxor (%eax), %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: fneg_v16f16: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vpxor (%eax), %ymm0, %ymm0 ; X86-AVX512-NEXT: retl ; @@ -418,12 +418,19 @@ define <8 x double> @fneg_v8f64(<8 x double> %p) nounwind { ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: fneg_v8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fneg_v8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fneg_v8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512VL-LABEL: fneg_v8f64: ; X86-AVX512VL: # %bb.0: @@ -449,12 +456,19 @@ define <8 x double> @fneg_v8f64(<8 x double> %p) nounwind { ; X64-SSE-NEXT: xorps %xmm4, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: fneg_v8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: fneg_v8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fneg_v8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: fneg_v8f64: ; X64-AVX512VL: # %bb.0: @@ -490,12 +504,19 @@ define <16 x float> @fneg_v16f32(<16 x float> %p) nounwind { ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: fneg_v16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-AVX-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fneg_v16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vxorps %ymm2, 
%ymm1, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: fneg_v16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX2-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512VL-LABEL: fneg_v16f32: ; X86-AVX512VL: # %bb.0: @@ -521,12 +542,19 @@ define <16 x float> @fneg_v16f32(<16 x float> %p) nounwind { ; X64-SSE-NEXT: xorps %xmm4, %xmm3 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: fneg_v16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-AVX-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: fneg_v16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fneg_v16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: fneg_v16f32: ; X64-AVX512VL: # %bb.0: @@ -563,7 +591,7 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { ; X86-AVX1-LABEL: fneg_v32f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-AVX1-NEXT: vxorps (%eax), %ymm1, %ymm0 ; X86-AVX1-NEXT: vxorps 32(%eax), %ymm1, %ymm1 ; X86-AVX1-NEXT: retl @@ -586,8 +614,8 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { ; ; X86-AVX512FP16-LABEL: fneg_v32f16: ; X86-AVX512FP16: # %bb.0: -; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512FP16-NEXT: vpxorq (%eax), %zmm0, %zmm0 ; X86-AVX512FP16-NEXT: retl ; @@ -613,7 +641,7 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind { ; ; X64-AVX1-LABEL: fneg_v32f16: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X64-AVX1-NEXT: vxorps (%rdi), %ymm1, %ymm0 ; X64-AVX1-NEXT: vxorps 32(%rdi), %ymm1, %ymm1 ; X64-AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index df2dc77dc1259..82f578c4c9fa1 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -20,42 +20,41 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x 
double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i64: ; SSE: # %bb.0: ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f64_to_2i64: ; VEX: # %bb.0: ; VEX-NEXT: vcvttsd2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 ; VEX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm0, %rax +; VEX-NEXT: vcvttsd2si %xmm0, %rcx ; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: vmovq %rcx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptosi_2f64_to_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax +; AVX512F-NEXT: vcvttsd2si %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f64_to_2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_2f64_to_2i64: @@ -250,27 +249,26 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i64: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: subsd %xmm1, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rax ; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: subsd %xmm2, %xmm0 +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: subsd %xmm1, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_2i64: @@ -283,37 +281,37 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vcvttsd2si %xmm1, %rax ; VEX-NEXT: vcvttsd2si %xmm0, %rcx +; 
VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: movq %rcx, %rdx ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovq %rdx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f64_to_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512F-NEXT: vcvttsd2usi %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f64_to_2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f64_to_2i64: @@ -337,9 +335,9 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: andpd %xmm2, %xmm0 ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -347,9 +345,9 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { ; VEX-LABEL: fptoui_2f64_to_4i32: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 ; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 ; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; VEX-NEXT: retq @@ -389,9 +387,9 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: andpd %xmm2, %xmm0 ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -399,9 +397,9 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { ; VEX-LABEL: fptoui_2f64_to_2i32: ; VEX: # %bb.0: ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 ; VEX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 ; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; VEX-NEXT: retq @@ -441,9 +439,9 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: psrad $31, %xmm2 ; SSE-NEXT: andpd %xmm2, %xmm0 ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -452,9 +450,9 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovapd %xmm0, %xmm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX1-NEXT: 
vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper @@ -510,37 +508,37 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0] -; SSE-NEXT: subsd %xmm3, %xmm0 -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: subsd %xmm3, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: subsd %xmm3, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: subsd %xmm3, %xmm2 +; SSE-NEXT: cvttsd2si %xmm2, %rcx ; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: cvttsd2si %xmm1, %rax ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: subsd %xmm3, %xmm1 ; SSE-NEXT: cvttsd2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx @@ -573,10 +571,10 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax ; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx @@ -616,10 +614,10 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttsd2si %xmm3, %rax +; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttsd2si %xmm4, %rax ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx @@ -692,16 +690,16 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: movapd {{.*#+}} xmm2 = [2.147483648E+9,2.147483648E+9] -; SSE-NEXT: cvttpd2dq %xmm1, %xmm3 -; SSE-NEXT: 
subpd %xmm2, %xmm1 +; SSE-NEXT: cvttpd2dq %xmm1, %xmm2 +; SSE-NEXT: movapd {{.*#+}} xmm3 = [2.147483648E+9,2.147483648E+9] +; SSE-NEXT: subpd %xmm3, %xmm1 ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 -; SSE-NEXT: movapd %xmm3, %xmm4 +; SSE-NEXT: movapd %xmm2, %xmm4 ; SSE-NEXT: psrad $31, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 -; SSE-NEXT: subpd %xmm2, %xmm0 +; SSE-NEXT: subpd %xmm3, %xmm0 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm2 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: psrad $31, %xmm0 @@ -713,9 +711,9 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { ; AVX1-LABEL: fptoui_4f64_to_4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper @@ -800,42 +798,41 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i64: ; SSE: # %bb.0: ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f32_to_2i64: ; VEX: # %bb.0: ; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx ; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: vmovq %rcx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptosi_2f32_to_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vcvttss2si %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f32_to_2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_2f32_to_2i64: @@ -859,12 +856,11 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptosi_4f32_to_2i64: ; SSE: # %bb.0: ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: 
fptosi_4f32_to_2i64: @@ -1279,27 +1275,26 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptoui_2f32_to_2i64: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rax ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: subss %xmm2, %xmm0 +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: subss %xmm1, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f32_to_2i64: @@ -1312,37 +1307,37 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: movq %rcx, %rdx ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovq %rdx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i64: @@ -1365,27 +1360,26 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { ; SSE-LABEL: fptoui_4f32_to_2i64: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; 
SSE-NEXT: cvttss2si %xmm2, %rax ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: subss %xmm2, %xmm0 +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: subss %xmm1, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rdx, %xmm0 ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_4f32_to_2i64: @@ -1529,30 +1523,30 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm3, %rax ; SSE-NEXT: subss %xmm1, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: subss %xmm1, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx @@ -1585,10 +1579,10 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx @@ -1628,10 +1622,10 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx @@ -1713,30 +1707,30 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm3, %rax +; SSE-NEXT: movq %rdx, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm3 ; SSE-NEXT: cvttss2si 
%xmm3, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE-NEXT: cvttss2si %xmm3, %rax ; SSE-NEXT: subss %xmm1, %xmm3 ; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: subss %xmm1, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx @@ -1769,10 +1763,10 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX1-NEXT: andq %rax, %rdx ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: vcvttss2si %xmm0, %rcx +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: sarq $63, %rdx ; AVX1-NEXT: andq %rax, %rdx @@ -1812,10 +1806,10 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX2-NEXT: andq %rax, %rdx ; AVX2-NEXT: orq %rcx, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax +; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vcvttss2si %xmm4, %rax ; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: sarq $63, %rdx ; AVX2-NEXT: andq %rax, %rdx @@ -1896,15 +1890,10 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f64_to_2i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,18446744073709551615] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -1915,15 +1904,10 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() { ; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f64_to_2i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f64_to_2i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,1,0,0] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0] +; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -1936,15 +1920,10 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f64_to_4i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = 
[1,18446744073709551615,2,18446744073709551613] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f64_to_4i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX-NEXT: retq %cvt = fptosi <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -1955,15 +1934,10 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f64_to_4i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f64_to_4i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX-NEXT: retq %cvt = fptosi <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -1974,15 +1948,10 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_2f64_to_2i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,4] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -1993,15 +1962,10 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_2f64_to_2i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_2f64_to_2i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,0,0] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_2f64_to_2i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] +; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -2014,15 +1978,10 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_4f64_to_4i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_4f64_to_4i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,4,6,8] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_4f64_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; AVX-NEXT: retq %cvt = fptoui <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -2033,15 +1992,10 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_4f64_to_4i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_4f64_to_4i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,6,8] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_4f64_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; AVX-NEXT: retq %cvt = fptoui <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2052,15 
+2006,10 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f32_to_4i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f32_to_4i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,4294967295,2,3] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX-NEXT: retq %cvt = fptosi <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2072,15 +2021,10 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f32_to_4i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f32_to_4i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX-NEXT: retq %cvt = fptosi <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2092,15 +2036,10 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_8f32_to_8i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_8f32_to_8i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX-NEXT: retq %cvt = fptosi <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2111,15 +2050,10 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_4f32_to_4i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_4f32_to_4i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,2,4,6] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_4f32_to_4i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; AVX-NEXT: retq %cvt = fptoui <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2131,15 +2065,10 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_4f32_to_4i64_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_4f32_to_4i64_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8] -; AVX512-NEXT: retq +; AVX-LABEL: fptoui_4f32_to_4i64_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] +; AVX-NEXT: retq %cvt = fptoui <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2151,15 +2080,10 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] ; SSE-NEXT: retq ; -; VEX-LABEL: fptoui_8f32_to_8i32_const: -; VEX: # %bb.0: -; VEX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; VEX-NEXT: retq -; -; AVX512-LABEL: fptoui_8f32_to_8i32_const: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; 
AVX512-NEXT: retq +; AVX-LABEL: fptoui_8f32_to_8i32_const: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX-NEXT: retq %cvt = fptoui <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2489,9 +2413,9 @@ define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) { ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 ; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: retq @@ -2544,8 +2468,8 @@ define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) { ; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 +; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: pslld $16, %xmm0 @@ -2689,12 +2613,12 @@ define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) { define <2 x i64> @fptosi_2f32_to_2i64_load(ptr %x) { ; SSE-LABEL: fptosi_2f32_to_2i64_load: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: movq %rcx, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; @@ -2702,33 +2626,33 @@ define <2 x i64> @fptosi_2f32_to_2i64_load(ptr %x) { ; VEX: # %bb.0: ; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm0, %rax +; VEX-NEXT: vcvttss2si %xmm0, %rcx ; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; VEX-NEXT: vmovq %rcx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptosi_2f32_to_2i64_load: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax +; AVX512F-NEXT: vcvttss2si %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f32_to_2i64_load: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load: @@ -2751,21 +2675,21 @@ define <2 x i64> @fptosi_2f32_to_2i64_load(ptr %x) { define <2 x i64> @fptoui_2f32_to_2i64_load(ptr %x) { ; SSE-LABEL: 
fptoui_2f32_to_2i64_load: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: subss %xmm2, %xmm0 -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: cvttss2si %xmm1, %rcx +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx @@ -2785,39 +2709,39 @@ define <2 x i64> @fptoui_2f32_to_2i64_load(ptr %x) { ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 ; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vcvttss2si %xmm1, %rax ; VEX-NEXT: vcvttss2si %xmm0, %rcx +; VEX-NEXT: vmovq %rdx, %xmm0 ; VEX-NEXT: movq %rcx, %rdx ; VEX-NEXT: sarq $63, %rdx ; VEX-NEXT: andq %rax, %rdx ; VEX-NEXT: orq %rcx, %rdx -; VEX-NEXT: vmovq %rdx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; VEX-NEXT: vmovq %rdx, %xmm1 +; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i64_load: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax +; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i64_load: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rcx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load: diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll index ddec397325d7f..e45b2d1151103 100644 --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -198,10 +198,10 @@ define void @fpext_frommem8(ptr %in, ptr %out) { ; ; X86-AVX-LABEL: fpext_frommem8: ; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vcvtps2pd (%eax), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x00] +; X86-AVX-NEXT: vcvtps2pd 16(%eax), %ymm1 # encoding: [0xc5,0xfc,0x5a,0x48,0x10] ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x08] -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X86-AVX-NEXT: vcvtps2pd (%ecx), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x01] -; X86-AVX-NEXT: vcvtps2pd 16(%ecx), %ymm1 # encoding: [0xc5,0xfc,0x5a,0x49,0x10] ; X86-AVX-NEXT: vmovups %ymm1, 32(%eax) # encoding: [0xc5,0xfc,0x11,0x48,0x20] ; X86-AVX-NEXT: vmovups %ymm0, (%eax) # encoding: [0xc5,0xfc,0x11,0x00] ; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] diff --git a/llvm/test/CodeGen/X86/vec_fptrunc.ll b/llvm/test/CodeGen/X86/vec_fptrunc.ll index 5b2dafaf3ac28..24b5f5a6056aa 100644 --- a/llvm/test/CodeGen/X86/vec_fptrunc.ll +++ b/llvm/test/CodeGen/X86/vec_fptrunc.ll @@ -43,9 +43,9 @@ define void @fptrunc_frommem4(ptr %in, ptr %out) { ; X86-SSE-LABEL: fptrunc_frommem4: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0 -; X86-SSE-NEXT: cvtpd2ps (%ecx), %xmm1 +; X86-SSE-NEXT: cvtpd2ps 16(%eax), %xmm0 +; X86-SSE-NEXT: cvtpd2ps (%eax), %xmm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X86-SSE-NEXT: movupd %xmm1, (%eax) ; X86-SSE-NEXT: retl @@ -82,23 +82,23 @@ define void @fptrunc_frommem8(ptr %in, ptr %out) { ; X86-SSE-LABEL: fptrunc_frommem8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0 -; X86-SSE-NEXT: cvtpd2ps (%ecx), %xmm1 +; X86-SSE-NEXT: cvtpd2ps 16(%eax), %xmm0 +; X86-SSE-NEXT: cvtpd2ps (%eax), %xmm1 +; X86-SSE-NEXT: cvtpd2ps 48(%eax), %xmm2 ; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X86-SSE-NEXT: cvtpd2ps 48(%ecx), %xmm0 -; X86-SSE-NEXT: cvtpd2ps 32(%ecx), %xmm2 -; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X86-SSE-NEXT: movupd %xmm2, 16(%eax) +; X86-SSE-NEXT: cvtpd2ps 32(%eax), %xmm0 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: movupd %xmm0, 16(%eax) ; X86-SSE-NEXT: movupd %xmm1, (%eax) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: fptrunc_frommem8: ; X86-AVX: # %bb.0: # %entry ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0 -; X86-AVX-NEXT: vcvtpd2psy 32(%ecx), %xmm1 +; X86-AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; X86-AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: vmovupd %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovupd %xmm0, (%eax) ; X86-AVX-NEXT: retl @@ -107,11 +107,11 @@ define void @fptrunc_frommem8(ptr %in, ptr %out) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0 ; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1 +; X64-SSE-NEXT: cvtpd2ps 48(%rdi), %xmm2 +; X64-SSE-NEXT: cvtpd2ps 32(%rdi), %xmm3 ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE-NEXT: cvtpd2ps 48(%rdi), %xmm0 -; X64-SSE-NEXT: cvtpd2ps 32(%rdi), %xmm2 -; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-SSE-NEXT: movupd %xmm2, 16(%rsi) +; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X64-SSE-NEXT: movupd %xmm3, 16(%rsi) ; X64-SSE-NEXT: movupd %xmm1, (%rsi) ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll index cf70d5d7f1edf..2319a032418cc 100644 --- a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll +++ b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll @@ -40,9 +40,9 @@ define i32 @t1(i32 
inreg %t7, <4 x i32> inreg %t8) nounwind { ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $32, %esp -; X32-NEXT: andl $3, %eax ; X32-NEXT: movl $76, %ecx ; X32-NEXT: pinsrd $0, %ecx, %xmm0 +; X32-NEXT: andl $3, %eax ; X32-NEXT: movdqa %xmm0, (%esp) ; X32-NEXT: movl (%esp,%eax,4), %eax ; X32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index af841cf38b24a..d8179c3625607 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -65,7 +65,7 @@ define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) { ; AVX2-LABEL: uitofp_2i32_to_2f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 @@ -319,14 +319,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: cvtsi2sd %rax, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE2-NEXT: movapd %xmm2, %xmm0 ; SSE2-NEXT: movapd %xmm3, %xmm1 ; SSE2-NEXT: retq @@ -338,14 +338,13 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2sd %rax, %xmm2 +; SSE41-NEXT: cvtsi2sd %rax, %xmm3 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_4i64_to_4f64: @@ -355,12 +354,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -371,12 +370,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -387,12 +386,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -403,12 +402,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -689,7 +688,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { ; AVX2-LABEL: uitofp_2i32_to_2f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -966,9 +965,9 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -980,9 +979,9 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: 
vbroadcastsd {{.*#+}} ymm3 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vsubpd %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -1044,16 +1043,16 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_4i32_to_4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1545,9 +1544,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 @@ -1555,6 +1552,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -1567,14 +1565,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; SSE41-NEXT: retq ; @@ -1584,13 +1581,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1601,13 +1598,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1618,13 +1615,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1635,13 +1632,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1865,7 +1862,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_2i64_to_4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; 
SSE41-NEXT: psrlq $1, %xmm2 @@ -1980,7 +1977,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_2i64_to_2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 @@ -2094,7 +2091,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; ; SSE41-LABEL: uitofp_4i64_to_4f32_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 @@ -2141,8 +2138,8 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; ; AVX2-LABEL: uitofp_4i64_to_4f32_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -2151,13 +2148,13 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 @@ -2445,43 +2442,42 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; ; SSE41-LABEL: uitofp_4i64_to_4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1,1] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrlq $1, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 ; SSE41-NEXT: pextrq $1, %xmm5, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 ; SSE41-NEXT: movq %xmm5, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrlq $1, %xmm5 +; SSE41-NEXT: por %xmm3, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[1,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] -; SSE41-NEXT: pand %xmm2, 
%xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psrlq $1, %xmm5 -; SSE41-NEXT: por %xmm4, %xmm5 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] -; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0] -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: addps %xmm3, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: addps %xmm1, %xmm3 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_4i64_to_4f32: @@ -2497,13 +2493,13 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -2522,14 +2518,14 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vcvtsi2ss %rcx, %xmm4, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 @@ -2543,13 +2539,13 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2560,13 +2556,13 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2647,9 +2643,9 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vsubps %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3000,23 +2996,22 @@ define <2 x double> @sitofp_load_2i8_to_2f64(ptr%a) { define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { ; SSE-LABEL: sitofp_load_4i64_to_4f64: ; SSE: # %bb.0: -; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm1 +; SSE-NEXT: cvtsi2sdq 8(%rdi), %xmm2 ; SSE-NEXT: cvtsi2sdq (%rdi), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: cvtsi2sdq 24(%rdi), %xmm2 -; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2sdq 24(%rdi), %xmm3 ; SSE-NEXT: cvtsi2sdq 16(%rdi), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: retq ; ; VEX-LABEL: sitofp_load_4i64_to_4f64: ; VEX: # %bb.0: ; VEX-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 ; VEX-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm3, %xmm3 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 -; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0] +; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; VEX-NEXT: retq ; @@ -3024,10 +3019,10 @@ define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm3, %xmm3 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -3035,10 +3030,10 @@ define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm3, %xmm3 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3260,7 +3255,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(ptr%a) { ; AVX2-LABEL: uitofp_load_2i32_to_2f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -3560,9 +3555,9 @@ define <4 x double> @uitofp_load_4i64_to_4f64(ptr%a) { ; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3575,9 +3570,9 @@ define <4 x double> @uitofp_load_4i64_to_4f64(ptr%a) { ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vsubpd %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -3643,10 +3638,10 @@ define <4 x double> @uitofp_load_4i32_to_4f64(ptr%a) { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} 
xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_4i32_to_4f64: @@ -3751,60 +3746,57 @@ define <4 x double> @uitofp_load_4i8_to_4f64(ptr%a) { define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: sitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 -; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm1 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2 +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm3 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_4i64_to_4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1 ; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0 +; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm2 +; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; SSE41-NEXT: retq ; ; VEX-LABEL: sitofp_load_4i64_to_4f32: ; VEX: # %bb.0: ; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 ; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm3 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; 
AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: @@ -3893,64 +3885,62 @@ define <4 x float> @sitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: sitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm0 -; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm2 -; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq 24(%rdi), %xmm1 +; SSE2-NEXT: cvtsi2ssq 16(%rdi), %xmm2 +; SSE2-NEXT: cvtsi2ssq 8(%rdi), %xmm3 ; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq 56(%rdi), %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm3 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ssq 48(%rdi), %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: cvtsi2ssq 40(%rdi), %xmm3 +; SSE2-NEXT: cvtsi2ssq 40(%rdi), %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq 32(%rdi), %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_8i64_to_8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: cvtsi2ssq 8(%rdi), %xmm1 ; SSE41-NEXT: cvtsi2ssq (%rdi), %xmm0 +; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm2 +; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ssq 16(%rdi), %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ssq 24(%rdi), %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: cvtsi2ssq 40(%rdi), %xmm2 +; SSE41-NEXT: cvtsi2ssq 40(%rdi), %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ssq 32(%rdi), %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ssq 48(%rdi), %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[2,3] +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ssq 56(%rdi), %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ssq 56(%rdi), %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; SSE41-NEXT: retq ; ; VEX-LABEL: sitofp_load_8i64_to_8f32: ; VEX: # %bb.0: ; VEX-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 ; VEX-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm3, %xmm3 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm4, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm4, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm4, %xmm3 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm4, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; VEX-NEXT: retq @@ -3959,17 +3949,17 @@ define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm4, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm4, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm4, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm4, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -3978,17 +3968,17 @@ define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm4, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm4, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm4, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm4, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -4185,43 +4175,41 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE41-LABEL: uitofp_load_4i64_to_4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1,1] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: pextrq $1, %xmm5, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: movq %xmm5, %rax +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: movdqa 16(%rdi), %xmm0 +; SSE41-NEXT: cvtsi2ss %rax, %xmm5 +; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ss %rax, %xmm2 +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrlq $1, %xmm4 +; SSE41-NEXT: por %xmm3, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psrlq $1, %xmm5 -; SSE41-NEXT: por %xmm4, %xmm5 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] -; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[2,3] ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0] -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: addps %xmm3, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; SSE41-NEXT: 
insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[0] +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: addps %xmm2, %xmm3 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i64_to_4f32: @@ -4238,13 +4226,13 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -4254,27 +4242,27 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; ; AVX2-LABEL: uitofp_load_4i64_to_4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vblendvpd %ymm1, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vcvtsi2ss %rcx, %xmm4, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 -; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX2-NEXT: vpackssdw 16(%rdi), %xmm1, %xmm1 +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4282,22 +4270,22 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 -; 
AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: @@ -4574,88 +4562,87 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; ; SSE41-LABEL: uitofp_load_8i64_to_8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 -; SSE41-NEXT: movdqa 32(%rdi), %xmm6 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm7 = [1,1] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: movdqa (%rdi), %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1,1] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: psrlq $1, %xmm1 ; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ss %rax, %xmm1 +; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ss %rax, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: psrlq $1, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: movq %xmm3, %rax +; SSE41-NEXT: pextrq $1, %xmm3, %rax ; SSE41-NEXT: xorps %xmm3, %xmm3 ; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[2,3] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3] -; SSE41-NEXT: movaps %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movq %xmm5, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0],xmm3[3] -; SSE41-NEXT: pextrq $1, %xmm5, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = 
xmm3[0,1,2],xmm0[0] -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: addps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm4, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[0] +; SSE41-NEXT: movaps %xmm4, %xmm1 +; SSE41-NEXT: addps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: psrlq $1, %xmm1 ; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: movdqa 48(%rdi), %xmm0 +; SSE41-NEXT: xorps %xmm6, %xmm6 +; SSE41-NEXT: cvtsi2ss %rax, %xmm6 +; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] -; SSE41-NEXT: pand %xmm2, %xmm7 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrlq $1, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm2[1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] -; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: pand %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[1,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2ss %rax, %xmm0 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: movaps %xmm6, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: addps %xmm1, %xmm3 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i64_to_8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm3 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5 @@ -4667,85 +4654,85 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 ; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 -; AVX1-NEXT: vinsertps 
{{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0],xmm4[3] ; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] ; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm3, %xmm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm2 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm5 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm8 +; AVX1-NEXT: vblendvps %xmm1, %xmm4, %xmm3, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm9, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0],xmm2[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_8i64_to_8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm4 -; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3] -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] -; AVX2-NEXT: vaddps %xmm3, %xmm3, %xmm4 -; AVX2-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm1, %xmm4, %xmm3, %xmm1 -; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm2 -; 
AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm3 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm2 +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] ; AVX2-NEXT: vaddps %xmm2, %xmm2, %xmm3 -; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX2-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm1, %xmm3, %xmm2, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm3 +; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm5 +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm3 +; AVX2-NEXT: vpackssdw 16(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -4753,17 +4740,17 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 56(%rdi), %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm4, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm4, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm4, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm4, %xmm2 +; AVX512F-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -4772,17 +4759,17 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm4, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm4, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm4, %xmm3 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm4, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -4866,9 +4853,9 @@ define <8 x float> @uitofp_load_8i32_to_8f32(ptr%a) { ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vsubps %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -5022,9 +5009,9 @@ define void @aggregate_sitofp_8i16_to_8f32(ptr nocapture readonly %a0) { ; ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: movq 24(%rdi), %rax ; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0 ; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: movq 24(%rdi), %rax ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, (%rax) @@ -5137,18 +5124,18 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { ; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: ; SSE: # %bb.0: ; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: incl %eax ; SSE-NEXT: cvtsi2ss %eax, %xmm1 +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: divss %xmm1, %xmm0 ; SSE-NEXT: 
retq ; ; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: ; AVX: # %bb.0: ; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: incl %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 @@ -5468,7 +5455,7 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; ; SSE41-LABEL: PR43609: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2,2] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -5529,10 +5516,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX2-NEXT: # xmm6 = mem[0,0] @@ -5557,10 +5544,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512F-NEXT: # xmm6 = mem[0,0] diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll index 853e29b8acfcd..a20e6b4c83de3 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -34,7 +34,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_gt_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -119,7 +119,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-LABEL: max_gt_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm6 @@ -415,7 +415,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: max_ge_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; 
SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 @@ -500,7 +500,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-LABEL: max_ge_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm6 @@ -796,7 +796,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_lt_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 @@ -881,7 +881,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-LABEL: min_lt_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 @@ -1177,7 +1177,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41-LABEL: min_le_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 @@ -1262,7 +1262,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41-LABEL: min_le_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm6 ; SSE41-NEXT: pxor %xmm5, %xmm6 @@ -1541,20 +1541,10 @@ define <2 x i64> @max_gt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sgt <2 x i64> %1, %2 @@ -1569,20 +1559,10 @@ define <4 x i64> @max_gt_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sgt <4 x i64> %1, %2 @@ -1596,20 +1576,10 @@ define <4 x i32> @max_gt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sgt <4 x i32> %1, %2 @@ -1624,20 +1594,10 @@ define <8 x i32> @max_gt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sgt <8 x i32> %1, %2 @@ -1703,20 +1663,10 @@ define <2 x i64> @max_ge_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sge <2 x i64> %1, %2 @@ -1731,20 +1681,10 @@ define <4 x i64> @max_ge_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sge <4 x i64> %1, %2 @@ -1758,20 
+1698,10 @@ define <4 x i32> @max_ge_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sge <4 x i32> %1, %2 @@ -1786,20 +1716,10 @@ define <8 x i32> @max_ge_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sge <8 x i32> %1, %2 @@ -1865,20 +1785,10 @@ define <2 x i64> @min_lt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp slt <2 x i64> %1, %2 @@ -1893,20 +1803,10 @@ define <4 x i64> @min_lt_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp slt <4 x i64> %1, %2 @@ -1920,20 +1820,10 @@ define <4 x i32> @min_lt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: 
vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp slt <4 x i32> %1, %2 @@ -1948,20 +1838,10 @@ define <8 x i32> @min_lt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp slt <8 x i32> %1, %2 @@ -2027,20 +1907,10 @@ define <2 x i64> @min_le_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sle <2 x i64> %1, %2 @@ -2055,20 +1925,10 @@ define <4 x i64> @min_le_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sle <4 x i64> %1, %2 @@ -2082,20 +1942,10 @@ define <4 x i32> @min_le_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; 
AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sle <4 x i32> %1, %2 @@ -2110,20 +1960,10 @@ define <8 x i32> @min_le_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sle <8 x i32> %1, %2 diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll index 9b4da3f9b817f..89653bd00b874 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -72,7 +72,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: max_gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -480,7 +480,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: max_ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -887,7 +887,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: min_lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1297,7 +1297,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: min_le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1653,20 +1653,10 @@ define <2 x i64> @max_gt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = 
[18446744073709551615,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ugt <2 x i64> %1, %2 @@ -1681,20 +1671,10 @@ define <4 x i64> @max_gt_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ugt <4 x i64> %1, %2 @@ -1708,20 +1688,10 @@ define <4 x i32> @max_gt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ugt <4 x i32> %1, %2 @@ -1736,20 +1706,10 @@ define <8 x i32> @max_gt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_gt_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_gt_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_gt_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_gt_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ugt <8 x i32> %1, %2 @@ -1815,20 +1775,10 @@ define <2 x i64> @max_ge_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX512-NEXT: retq +; AVX-LABEL: 
max_ge_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp uge <2 x i64> %1, %2 @@ -1843,20 +1793,10 @@ define <4 x i64> @max_ge_v4i64c() { ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp uge <4 x i64> %1, %2 @@ -1870,20 +1810,10 @@ define <4 x i32> @max_ge_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp uge <4 x i32> %1, %2 @@ -1898,20 +1828,10 @@ define <8 x i32> @max_ge_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX1-LABEL: max_ge_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: max_ge_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: max_ge_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX512-NEXT: retq +; AVX-LABEL: max_ge_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp uge <8 x i32> %1, %2 @@ -1977,20 +1897,10 @@ define <2 x i64> @min_lt_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 
= icmp ult <2 x i64> %1, %2 @@ -2005,20 +1915,10 @@ define <4 x i64> @min_lt_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v4i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ult <4 x i64> %1, %2 @@ -2032,20 +1932,10 @@ define <4 x i32> @min_lt_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ult <4 x i32> %1, %2 @@ -2060,20 +1950,10 @@ define <8 x i32> @min_lt_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_lt_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_lt_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_lt_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_lt_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ult <8 x i32> %1, %2 @@ -2139,20 +2019,10 @@ define <2 x i64> @min_le_v2i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v2i64c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v2i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v2i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v2i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ule <2 x i64> %1, %2 @@ -2167,20 +2037,10 @@ define <4 x i64> @min_le_v4i64c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v4i64c: -; AVX1: # %bb.0: 
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v4i64c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v4i64c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v4i64c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ule <4 x i64> %1, %2 @@ -2194,20 +2054,10 @@ define <4 x i32> @min_le_v4i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v4i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v4i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v4i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v4i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ule <4 x i32> %1, %2 @@ -2222,20 +2072,10 @@ define <8 x i32> @min_le_v8i32c() { ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: min_le_v8i32c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: min_le_v8i32c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX2-NEXT: retq -; -; AVX512-LABEL: min_le_v8i32c: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX512-NEXT: retq +; AVX-LABEL: min_le_v8i32c: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ule <8 x i32> %1, %2 diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 78dd2cf783ef8..10c04b24ee19b 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -110,9 +110,9 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) ; SSE41-NEXT: movq %xmm1, (%rdi) ; SSE41-NEXT: retq ; @@ -121,9 +121,9 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vmovq %xmm1, (%rdi) ; AVX-NEXT: retq ; @@ -134,9 +134,9 @@ define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: 
vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -274,7 +274,6 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: saddo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movd %esi, %xmm1 ; SSE41-NEXT: pinsrd $1, %edx, %xmm1 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 @@ -286,6 +285,7 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: movdqa %xmm1, %xmm4 @@ -751,74 +751,28 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind { } define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { -; SSE2-LABEL: saddo_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: saddo_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: paddq %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: saddo_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: 
pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: saddo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: saddo_v2i64: ; AVX: # %bb.0: @@ -935,8 +889,8 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) ; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) @@ -966,8 +920,8 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: movw %ax, 9(%rdi) ; AVX-NEXT: vpextrd $2, %xmm1, %ecx +; AVX-NEXT: movw %ax, 9(%rdi) ; AVX-NEXT: movw %cx, 6(%rdi) ; AVX-NEXT: vpextrd $1, %xmm1, %edx ; AVX-NEXT: movw %dx, 3(%rdi) @@ -995,8 +949,8 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -1065,9 +1019,9 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: saddo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: addq %r8, %rdi ; SSE2-NEXT: adcq %r9, %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: seto %r8b ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx @@ -1087,9 +1041,9 @@ define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; SSSE3-LABEL: saddo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: addq %r8, %rdi ; SSSE3-NEXT: adcq %r9, %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: seto %r8b ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx @@ -1109,9 +1063,9 @@ define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; SSE41-LABEL: saddo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: seto %r8b ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx 
@@ -1130,9 +1084,9 @@ define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; AVX-LABEL: saddo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: addq %r8, %rdi ; AVX-NEXT: adcq %r9, %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: seto %r8b ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll index 5a71878ea4579..583e5772d8aae 100644 --- a/llvm/test/CodeGen/X86/vec_setcc-2.ll +++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll @@ -32,7 +32,7 @@ define void @loop_no_const_reload(ptr %in, ptr %out, i32 %n) { ; SSE41-NEXT: je LBB0_3 ; SSE41-NEXT: ## %bb.1: ## %for.body.preheader ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25] ; SSE41-NEXT: .p2align 4 ; SSE41-NEXT: LBB0_2: ## %for.body ; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -100,7 +100,7 @@ define void @loop_const_folding_underflow(ptr %in, ptr %out, i32 %n) { ; SSE41-NEXT: je LBB1_3 ; SSE41-NEXT: ## %bb.1: ## %for.body.preheader ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26] ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: .p2align 4 ; SSE41-NEXT: LBB1_2: ## %for.body @@ -202,7 +202,7 @@ define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) { ; ; SSE41-LABEL: ugt_v8i16_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243] ; SSE41-NEXT: pmaxuw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -219,7 +219,7 @@ define <4 x i1> @ugt_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967255,4294967255,4294967255,4294967255] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -231,13 +231,14 @@ define <2 x i1> @ugt_v2i64_splat(<2 x i64> %x) { ; SSE2-LABEL: ugt_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ugt_v2i64_splat: @@ -271,7 +272,7 @@ define <8 x i1> @uge_v8i16_splat(<8 x i16> %x) { ; ; SSE41-LABEL: uge_v8i16_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242] ; SSE41-NEXT: pmaxuw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -291,7 +292,7 @@ define <4 x i1> @uge_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: 
uge_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -345,7 +346,7 @@ define <8 x i1> @ult_v8i16_splat(<8 x i16> %x) { ; ; SSE41-LABEL: ult_v8i16_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [241,241,241,241,241,241,241,241] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [241,241,241,241,241,241,241,241] ; SSE41-NEXT: pminuw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -364,7 +365,7 @@ define <4 x i1> @ult_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ult_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -415,7 +416,7 @@ define <8 x i1> @ule_v8i16_splat(<8 x i16> %x) { ; ; SSE41-LABEL: ule_v8i16_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [242,242,242,242,242,242,242,242] ; SSE41-NEXT: pminuw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -434,7 +435,7 @@ define <4 x i1> @ule_v4i32_splat(<4 x i32> %x) { ; ; SSE41-LABEL: ule_v4i32_splat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967254,4294967254,4294967254] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -501,7 +502,7 @@ define <4 x i1> @ugt_v4i32_nonsplat(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_nonsplat: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967254,4294967255,4294967256,4294967257] ; SSE41-NEXT: pmaxud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -520,7 +521,7 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) { ; ; SSE41-LABEL: ugt_v4i32_splat_commute: ; SSE41: ## %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [3,3,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3,3,3,3] ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -544,7 +545,7 @@ define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) { ; SSE41-LABEL: PR39859: ; SSE41: ## %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [43,43,43,43,43,43,43,43] ; SSE41-NEXT: pmaxuw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll index 657b71df9fae0..22af645b39c03 100644 --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -163,34 +163,23 @@ define <16 x i8> @or_icmp_eq_const_1bit_diff(<16 x i8> %x) { } define <4 x i32> @or_icmp_ne_const_1bit_diff(<4 x i32> %x) { -; SSE2-LABEL: or_icmp_ne_const_1bit_diff: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: por 
%xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: or_icmp_ne_const_1bit_diff: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,44,60] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: or_icmp_ne_const_1bit_diff: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60] +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: or_icmp_ne_const_1bit_diff: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq @@ -226,34 +215,23 @@ define <16 x i8> @and_icmp_eq_const_1bit_diff(<16 x i8> %x) { } define <4 x i32> @and_icmp_ne_const_1bit_diff(<4 x i32> %x) { -; SSE2-LABEL: and_icmp_ne_const_1bit_diff: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: and_icmp_ne_const_1bit_diff: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [44,60,54,44] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: and_icmp_ne_const_1bit_diff: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44,60,54,44] +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: and_icmp_ne_const_1bit_diff: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq @@ -344,9 +322,9 @@ define <3 x i1> @test_setcc_v3i1_v3i16(ptr %a) nounwind { ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: pextrb $2, %xmm1, %edx ; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -357,9 +335,9 @@ define <3 x i1> @test_setcc_v3i1_v3i16(ptr %a) nounwind { ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vpextrb $2, %xmm0, %edx ; AVX-NEXT: 
vpextrb $4, %xmm0, %ecx +; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: # kill: def $dl killed $dl killed $edx ; AVX-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/vec_shift4.ll b/llvm/test/CodeGen/X86/vec_shift4.ll index 25a8055ae0ddc..9789580d6e304 100644 --- a/llvm/test/CodeGen/X86/vec_shift4.ll +++ b/llvm/test/CodeGen/X86/vec_shift4.ll @@ -29,10 +29,10 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { ; X86: # %bb.0: # %entry ; X86-NEXT: movdqa %xmm1, %xmm2 ; X86-NEXT: movdqa %xmm0, %xmm1 -; X86-NEXT: psllw $5, %xmm2 ; X86-NEXT: movdqa %xmm0, %xmm3 ; X86-NEXT: psllw $4, %xmm3 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-NEXT: psllw $5, %xmm2 ; X86-NEXT: movdqa %xmm2, %xmm0 ; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm3 @@ -52,10 +52,10 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { ; X64-LABEL: shl2: ; X64: # %bb.0: # %entry ; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: psllw $5, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm3 ; X64-NEXT: psllw $4, %xmm3 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: psllw $5, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; X64-NEXT: movdqa %xmm2, %xmm3 diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll index 71e659c681d17..63688de94ef40 100644 --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -146,7 +146,7 @@ define <8 x i32> @test6(<8 x i32> %a) { ; ; SSE41-LABEL: test6: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,4,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8] ; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: retq @@ -231,7 +231,7 @@ define <16 x i32> @test8(<16 x i32> %a) { ; ; SSE41-LABEL: test8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [2,2,4,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8] ; SSE41-NEXT: pmulld %xmm4, %xmm0 ; SSE41-NEXT: pmulld %xmm4, %xmm1 ; SSE41-NEXT: pmulld %xmm4, %xmm2 @@ -287,7 +287,7 @@ define <8 x i64> @test9(<8 x i64> %a) { ; ; AVX2-LABEL: test9: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3] ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index a54ff67f74755..e8f942a3fe820 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -218,9 +218,9 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pmuldq %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 @@ -240,9 +240,9 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $2, 
%xmm1, 8(%rdi) ; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -257,9 +257,9 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -269,14 +269,14 @@ define <3 x i32> @smulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 ; AVX512-NEXT: vpcmpneqd %xmm0, %xmm4, %k1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.smul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -400,7 +400,7 @@ define <4 x i32> @smulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 @@ -564,12 +564,12 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: smulo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movd %esi, %xmm2 ; SSE41-NEXT: pinsrd $1, %edx, %xmm2 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmuldq %xmm2, %xmm0 @@ -663,7 +663,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 @@ -862,7 +862,7 @@ define <8 x i32> @smulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuldq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 ; AVX512-NEXT: vpsrad $31, %ymm1, %ymm0 @@ -1220,7 +1220,7 @@ define <16 x i32> @smulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = 
zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpmuldq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpsrad $31, %zmm1, %zmm0 @@ -1345,7 +1345,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: packuswb %xmm3, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm1, %xmm5 ; SSE41-NEXT: pand %xmm1, %xmm4 ; SSE41-NEXT: packuswb %xmm5, %xmm4 @@ -1665,7 +1665,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm3, %xmm7 ; SSE41-NEXT: psrlw $8, %xmm7 ; SSE41-NEXT: packuswb %xmm5, %xmm7 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: pand %xmm5, %xmm3 ; SSE41-NEXT: packuswb %xmm6, %xmm3 @@ -2251,7 +2251,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm10 ; SSE41-NEXT: psrlw $8, %xmm10 ; SSE41-NEXT: packuswb %xmm8, %xmm10 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm8, %xmm9 ; SSE41-NEXT: pand %xmm8, %xmm7 ; SSE41-NEXT: packuswb %xmm9, %xmm7 @@ -2586,59 +2586,59 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; ; AVX512F-LABEL: smulo_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm2 +; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm5 +; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5 +; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm6 +; AVX512F-NEXT: vpsraw $15, %ymm6, %ymm6 +; AVX512F-NEXT: vpmovsxwd %ymm6, %zmm6 +; AVX512F-NEXT: vpcmpneqd %zmm5, %zmm6, %k1 ; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4 -; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm3 -; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm4 +; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 +; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm5 +; AVX512F-NEXT: vpsraw $15, %ymm5, %ymm5 +; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5 +; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm5, %k2 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm5 +; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5 ; AVX512F-NEXT: vpsllw 
$8, %ymm4, %ymm6 ; AVX512F-NEXT: vpsraw $15, %ymm6, %ymm6 ; AVX512F-NEXT: vpmovsxwd %ymm6, %zmm6 -; AVX512F-NEXT: vpcmpneqd %zmm3, %zmm6, %k1 -; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm3 -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vpsllw $8, %ymm5, %ymm3 -; AVX512F-NEXT: vpsraw $15, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 -; AVX512F-NEXT: vpcmpneqd %zmm2, %zmm3, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm6 -; AVX512F-NEXT: vpsraw $8, %ymm6, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 -; AVX512F-NEXT: vpsllw $8, %ymm6, %ymm3 -; AVX512F-NEXT: vpsraw $15, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 -; AVX512F-NEXT: vpcmpneqd %zmm2, %zmm3, %k3 +; AVX512F-NEXT: vpcmpneqd %zmm5, %zmm6, %k3 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7 -; AVX512F-NEXT: vpsraw $8, %ymm7, %ymm0 +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vpsllw $8, %ymm7, %ymm1 +; AVX512F-NEXT: vpsllw $8, %ymm5, %ymm1 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512F-NEXT: vpcmpneqd %zmm0, %zmm1, %k4 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, 48(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, 32(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpmovdb %zmm3, 16(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) -; AVX512F-NEXT: 
vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) ; AVX512F-NEXT: retq ; @@ -2654,7 +2654,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm4 @@ -2887,19 +2887,19 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; AVX512F-LABEL: smulo_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: vmovq %xmm0, %rdx ; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512F-NEXT: imulq %rdx, %rsi -; AVX512F-NEXT: seto %dl +; AVX512F-NEXT: imulq %rcx, %rsi +; AVX512F-NEXT: seto %cl ; AVX512F-NEXT: vmovq %rsi, %xmm0 -; AVX512F-NEXT: imulq %rax, %rcx -; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: imulq %rax, %rdx +; AVX512F-NEXT: vmovq %rdx, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512F-NEXT: seto %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k1 @@ -2911,19 +2911,19 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; AVX512BW-LABEL: smulo_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq %xmm1, %rax -; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512BW-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512BW-NEXT: vmovq %xmm0, %rdx ; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512BW-NEXT: imulq %rdx, %rsi -; AVX512BW-NEXT: seto %dl +; AVX512BW-NEXT: imulq %rcx, %rsi +; AVX512BW-NEXT: seto %cl ; AVX512BW-NEXT: vmovq %rsi, %xmm0 -; AVX512BW-NEXT: imulq %rax, %rcx -; AVX512BW-NEXT: vmovq %rcx, %xmm1 +; AVX512BW-NEXT: imulq %rax, %rdx +; AVX512BW-NEXT: vmovq %rdx, %xmm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512BW-NEXT: seto %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k0 -; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k1 @@ -3067,9 +3067,9 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pslld $8, %xmm0 ; SSE41-NEXT: psrad $8, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: pextrd $2, %xmm1, %ecx +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE41-NEXT: pextrd $1, %xmm1, %edx ; SSE41-NEXT: movd %xmm1, %esi ; SSE41-NEXT: psrad $31, %xmm1 @@ -3115,8 +3115,8 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, 
<4 x i24> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: movw %cx, 6(%rdi) ; AVX1-NEXT: vpextrd $1, %xmm1, %edx ; AVX1-NEXT: movw %dx, 3(%rdi) @@ -3155,8 +3155,8 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: movw %cx, 6(%rdi) ; AVX2-NEXT: vpextrd $1, %xmm1, %edx ; AVX2-NEXT: movw %dx, 3(%rdi) @@ -3182,7 +3182,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrad $31, %xmm1, %xmm0 @@ -3194,8 +3194,8 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -3291,84 +3291,84 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r9, %r10 ; SSE2-NEXT: movq %rcx, %rbx -; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: movq %rdx, %r10 ; SSE2-NEXT: movq %rsi, %r11 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: imulq %r8, %rbp +; SSE2-NEXT: movq %rsi, %r13 +; SSE2-NEXT: sarq $63, %r13 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: movq %rdx, %r15 +; SSE2-NEXT: imulq %r8, %r13 ; SSE2-NEXT: addq %rsi, %r14 -; SSE2-NEXT: adcq %rbp, %r8 -; SSE2-NEXT: movq %r8, %rbp -; SSE2-NEXT: sarq $63, %rbp -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: imulq %rdi, %r9 +; SSE2-NEXT: adcq %r13, %r15 +; SSE2-NEXT: movq %r15, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: movq %r9, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: imulq %rdi, %r13 ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: mulq %r10 +; SSE2-NEXT: mulq %r9 ; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: addq %r14, %rsi -; SSE2-NEXT: adcq %r9, %rdi -; SSE2-NEXT: movq %rdi, %r9 -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: adcq %rbp, %r9 +; SSE2-NEXT: adcq %r13, %rdi +; SSE2-NEXT: movq %rdi, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: addq %r15, %rdi +; SSE2-NEXT: adcq %r8, %r13 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: imulq %r10 +; SSE2-NEXT: imulq 
%r9 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE2-NEXT: addq %rdi, %rax -; SSE2-NEXT: adcq %r9, %rdx -; SSE2-NEXT: movq %rsi, 8(%r15) +; SSE2-NEXT: adcq %r13, %rdx +; SSE2-NEXT: movq %rsi, 8(%r12) ; SSE2-NEXT: sarq $63, %rsi ; SSE2-NEXT: xorq %rsi, %rdx ; SSE2-NEXT: xorq %rax, %rsi -; SSE2-NEXT: xorl %r11d, %r11d +; SSE2-NEXT: xorl %ebp, %ebp ; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: setne %r11b -; SSE2-NEXT: movq %rbx, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: imulq %r13, %r10 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %r13 +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: movq %rbx, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: movq %rbx, %rax -; SSE2-NEXT: mulq %r13 -; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %rdi, %r9 -; SSE2-NEXT: adcq %r10, %r8 -; SSE2-NEXT: movq %r8, %r14 +; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: movq %rax, %r8 +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: imulq %r14, %r15 +; SSE2-NEXT: addq %rdi, %r8 +; SSE2-NEXT: adcq %r15, %r9 +; SSE2-NEXT: movq %r9, %r14 ; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movq %r12, %r13 -; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: imulq %rcx, %r13 -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %r12 +; SSE2-NEXT: movq %r11, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: imulq %r10, %r15 +; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: mulq %r11 ; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %r9, %r10 -; SSE2-NEXT: adcq %r13, %rdi -; SSE2-NEXT: movq %rdi, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: addq %r8, %rdi -; SSE2-NEXT: adcq %r14, %rcx +; SSE2-NEXT: addq %r8, %r10 +; SSE2-NEXT: adcq %r15, %rdi +; SSE2-NEXT: movq %rdi, %r8 +; SSE2-NEXT: sarq $63, %r8 +; SSE2-NEXT: addq %r9, %rdi +; SSE2-NEXT: adcq %r14, %r8 ; SSE2-NEXT: movq %rbx, %rax -; SSE2-NEXT: imulq %r12 +; SSE2-NEXT: imulq %r11 ; SSE2-NEXT: addq %rdi, %rax -; SSE2-NEXT: adcq %rcx, %rdx -; SSE2-NEXT: movq %r10, 24(%r15) +; SSE2-NEXT: adcq %r8, %rdx +; SSE2-NEXT: movq %r10, 24(%r12) ; SSE2-NEXT: sarq $63, %r10 ; SSE2-NEXT: xorq %r10, %rdx ; SSE2-NEXT: xorq %rax, %r10 @@ -3377,12 +3377,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: negl %r11d -; SSE2-NEXT: movd %r11d, %xmm0 +; SSE2-NEXT: negl %ebp +; SSE2-NEXT: movd %ebp, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rsi, 16(%r15) -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: movq %rax, (%r15) +; SSE2-NEXT: movq %rsi, 16(%r12) +; SSE2-NEXT: movq %rcx, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3399,84 +3398,84 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r9, %r10 ; SSSE3-NEXT: movq %rcx, %rbx -; SSSE3-NEXT: movq %rdx, %rcx +; SSSE3-NEXT: movq %rdx, %r10 ; SSSE3-NEXT: movq %rsi, %r11 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: imulq %r8, %rbp +; SSSE3-NEXT: movq %rsi, %r13 +; SSSE3-NEXT: sarq $63, %r13 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSSE3-NEXT: movq %rax, %rcx ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: movq %rdx, %r15 +; SSSE3-NEXT: imulq %r8, %r13 ; SSSE3-NEXT: addq %rsi, %r14 -; SSSE3-NEXT: adcq %rbp, %r8 -; SSSE3-NEXT: movq %r8, %rbp -; SSSE3-NEXT: sarq $63, %rbp -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: imulq %rdi, %r9 +; SSSE3-NEXT: adcq %r13, %r15 +; SSSE3-NEXT: movq %r15, %r8 +; SSSE3-NEXT: sarq $63, %r8 +; SSSE3-NEXT: movq %r9, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: imulq %rdi, %r13 ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: mulq %r10 +; SSSE3-NEXT: mulq %r9 ; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: addq %r14, %rsi -; SSSE3-NEXT: adcq %r9, %rdi -; SSSE3-NEXT: movq %rdi, %r9 -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: adcq %rbp, %r9 +; SSSE3-NEXT: adcq %r13, %rdi +; SSSE3-NEXT: movq %rdi, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: addq %r15, %rdi +; SSSE3-NEXT: adcq %r8, %r13 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: imulq %r10 +; SSSE3-NEXT: imulq %r9 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSSE3-NEXT: addq %rdi, %rax -; SSSE3-NEXT: adcq %r9, %rdx -; SSSE3-NEXT: movq %rsi, 8(%r15) +; SSSE3-NEXT: adcq %r13, %rdx +; SSSE3-NEXT: movq %rsi, 8(%r12) ; SSSE3-NEXT: sarq $63, %rsi ; SSSE3-NEXT: xorq %rsi, %rdx ; SSSE3-NEXT: xorq %rax, %rsi -; SSSE3-NEXT: xorl %r11d, %r11d +; SSSE3-NEXT: xorl %ebp, %ebp ; SSSE3-NEXT: orq %rdx, %rsi -; SSSE3-NEXT: setne %r11b -; SSSE3-NEXT: movq %rbx, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: imulq %r13, %r10 -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %r13 +; SSSE3-NEXT: setne %bpl +; SSSE3-NEXT: movq %rbx, %r15 +; SSSE3-NEXT: sarq $63, %r15 +; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: movq %rbx, %rax -; SSSE3-NEXT: mulq %r13 -; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %rdi, %r9 -; SSSE3-NEXT: adcq %r10, %r8 -; SSSE3-NEXT: movq %r8, %r14 +; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: movq %rdx, %r9 +; SSSE3-NEXT: imulq %r14, %r15 +; SSSE3-NEXT: addq %rdi, %r8 +; SSSE3-NEXT: adcq %r15, %r9 +; SSSE3-NEXT: movq %r9, %r14 ; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movq %r12, %r13 -; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: imulq %rcx, %r13 -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %r12 +; SSSE3-NEXT: movq %r11, %r15 +; SSSE3-NEXT: sarq $63, %r15 +; SSSE3-NEXT: imulq %r10, %r15 +; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: mulq %r11 ; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %r9, %r10 -; SSSE3-NEXT: adcq %r13, %rdi -; SSSE3-NEXT: movq %rdi, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: addq %r8, %rdi -; SSSE3-NEXT: adcq %r14, %rcx +; SSSE3-NEXT: addq %r8, %r10 +; SSSE3-NEXT: adcq %r15, %rdi +; SSSE3-NEXT: movq %rdi, %r8 +; SSSE3-NEXT: sarq $63, %r8 +; SSSE3-NEXT: addq %r9, %rdi +; SSSE3-NEXT: adcq %r14, %r8 ; SSSE3-NEXT: movq %rbx, %rax -; SSSE3-NEXT: imulq %r12 +; SSSE3-NEXT: imulq %r11 ; SSSE3-NEXT: addq %rdi, %rax -; SSSE3-NEXT: adcq %rcx, %rdx -; SSSE3-NEXT: movq %r10, 24(%r15) +; SSSE3-NEXT: adcq %r8, %rdx +; SSSE3-NEXT: movq %r10, 24(%r12) ; SSSE3-NEXT: sarq $63, %r10 ; SSSE3-NEXT: xorq %r10, %rdx ; SSSE3-NEXT: xorq %rax, %r10 @@ -3485,12 +3484,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr 
%p2) nounwind ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: negl %r11d -; SSSE3-NEXT: movd %r11d, %xmm0 +; SSSE3-NEXT: negl %ebp +; SSSE3-NEXT: movd %ebp, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rsi, 16(%r15) -; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSSE3-NEXT: movq %rax, (%r15) +; SSSE3-NEXT: movq %rsi, 16(%r12) +; SSSE3-NEXT: movq %rcx, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3507,97 +3505,98 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r9, %r10 ; SSE41-NEXT: movq %rcx, %rbx -; SSE41-NEXT: movq %rdx, %rcx +; SSE41-NEXT: movq %rdx, %r10 ; SSE41-NEXT: movq %rsi, %r11 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE41-NEXT: movq %rsi, %rbp -; SSE41-NEXT: sarq $63, %rbp -; SSE41-NEXT: imulq %r8, %rbp +; SSE41-NEXT: movq %rsi, %r13 +; SSE41-NEXT: sarq $63, %r13 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rdx, %rsi ; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: movq %rdx, %r15 +; SSE41-NEXT: imulq %r8, %r13 ; SSE41-NEXT: addq %rsi, %r14 -; SSE41-NEXT: adcq %rbp, %r8 -; SSE41-NEXT: movq %r8, %rbp -; SSE41-NEXT: sarq $63, %rbp -; SSE41-NEXT: sarq $63, %r9 -; SSE41-NEXT: imulq %rdi, %r9 +; SSE41-NEXT: adcq %r13, %r15 +; SSE41-NEXT: movq %r15, %r8 +; SSE41-NEXT: sarq $63, %r8 +; SSE41-NEXT: movq %r9, %r13 +; SSE41-NEXT: sarq $63, %r13 +; SSE41-NEXT: imulq %rdi, %r13 ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: mulq %r10 +; SSE41-NEXT: mulq %r9 ; SSE41-NEXT: movq %rdx, %rdi ; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: addq %r14, %rsi -; SSE41-NEXT: adcq %r9, %rdi -; SSE41-NEXT: movq %rdi, %r9 -; SSE41-NEXT: sarq $63, %r9 -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: adcq %rbp, %r9 +; SSE41-NEXT: adcq %r13, %rdi +; SSE41-NEXT: movq %rdi, %r14 +; SSE41-NEXT: sarq $63, %r14 +; SSE41-NEXT: addq %r15, %rdi +; SSE41-NEXT: adcq %r8, %r14 ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: imulq %r10 +; SSE41-NEXT: imulq %r9 ; SSE41-NEXT: addq %rdi, %rax -; SSE41-NEXT: adcq %r9, %rdx -; SSE41-NEXT: movq %rsi, 8(%r15) -; SSE41-NEXT: sarq $63, %rsi -; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: xorq %rax, %rsi -; SSE41-NEXT: xorl %r11d, %r11d -; SSE41-NEXT: orq %rdx, %rsi -; SSE41-NEXT: setne %r11b -; SSE41-NEXT: movq %rbx, %r10 -; SSE41-NEXT: sarq $63, %r10 -; SSE41-NEXT: imulq %r13, %r10 -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %r13 -; SSE41-NEXT: movq %rdx, %rdi -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: movq %rbx, %rax -; SSE41-NEXT: mulq %r13 +; SSE41-NEXT: adcq %r14, %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; SSE41-NEXT: movq %rsi, %rdi +; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: xorq %rdi, %rdx +; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: xorl %ebp, %ebp +; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: setne %bpl +; SSE41-NEXT: movq %rcx, %r15 +; SSE41-NEXT: sarq $63, %r15 +; SSE41-NEXT: imulq %r9, %r15 +; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: mulq %r9 ; SSE41-NEXT: movq %rdx, %r8 -; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %rdi, %r9 -; SSE41-NEXT: adcq %r10, %r8 -; SSE41-NEXT: movq %r8, %r14 -; SSE41-NEXT: sarq $63, 
%r14 -; SSE41-NEXT: movq %r12, %r13 -; SSE41-NEXT: sarq $63, %r13 -; SSE41-NEXT: imulq %rcx, %r13 +; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %r12 -; SSE41-NEXT: movq %rdx, %rdi -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %r9, %r10 -; SSE41-NEXT: adcq %r13, %rdi -; SSE41-NEXT: movq %rdi, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: addq %r8, %rdi -; SSE41-NEXT: adcq %r14, %rcx -; SSE41-NEXT: movq %rbx, %rax -; SSE41-NEXT: imulq %r12 -; SSE41-NEXT: addq %rdi, %rax -; SSE41-NEXT: adcq %rcx, %rdx -; SSE41-NEXT: movq %r10, 24(%r15) +; SSE41-NEXT: mulq %r9 +; SSE41-NEXT: movq %rdx, %r11 +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %r8, %r14 +; SSE41-NEXT: adcq %r15, %r11 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movq %r11, %r13 +; SSE41-NEXT: sarq $63, %r13 +; SSE41-NEXT: movq %rcx, %r15 +; SSE41-NEXT: sarq $63, %r15 +; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rax, %r9 +; SSE41-NEXT: movq %rdx, %r8 +; SSE41-NEXT: imulq %r10, %r15 +; SSE41-NEXT: addq %r14, %r9 +; SSE41-NEXT: adcq %r15, %r8 +; SSE41-NEXT: movq %r8, %r10 ; SSE41-NEXT: sarq $63, %r10 -; SSE41-NEXT: xorq %r10, %rdx -; SSE41-NEXT: xorq %rax, %r10 +; SSE41-NEXT: addq %r11, %r8 +; SSE41-NEXT: adcq %r13, %r10 +; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: imulq %rcx +; SSE41-NEXT: addq %r8, %rax +; SSE41-NEXT: adcq %r10, %rdx +; SSE41-NEXT: movq %r9, 24(%r12) +; SSE41-NEXT: sarq $63, %r9 +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: xorq %rax, %r9 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %r10 +; SSE41-NEXT: orq %rdx, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax -; SSE41-NEXT: negl %r11d -; SSE41-NEXT: movd %r11d, %xmm0 +; SSE41-NEXT: negl %ebp +; SSE41-NEXT: movd %ebp, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %rsi, 16(%r15) +; SSE41-NEXT: movq %rsi, 8(%r12) +; SSE41-NEXT: movq %rdi, 16(%r12) ; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE41-NEXT: movq %rax, (%r15) +; SSE41-NEXT: movq %rax, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3614,97 +3613,98 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r9, %r10 ; AVX-NEXT: movq %rcx, %rbx -; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: movq %rdx, %r10 ; AVX-NEXT: movq %rsi, %r11 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX-NEXT: movq %rsi, %rbp -; AVX-NEXT: sarq $63, %rbp -; AVX-NEXT: imulq %r8, %rbp +; AVX-NEXT: movq %rsi, %r13 +; AVX-NEXT: sarq $63, %r13 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: movq %rdx, %r15 +; AVX-NEXT: imulq %r8, %r13 ; AVX-NEXT: addq %rsi, %r14 -; AVX-NEXT: adcq %rbp, %r8 -; AVX-NEXT: movq %r8, %rbp -; AVX-NEXT: sarq $63, %rbp -; AVX-NEXT: sarq $63, %r9 -; AVX-NEXT: imulq %rdi, %r9 +; AVX-NEXT: adcq %r13, %r15 +; AVX-NEXT: movq %r15, %r8 +; AVX-NEXT: sarq $63, %r8 +; AVX-NEXT: movq %r9, %r13 +; AVX-NEXT: sarq $63, %r13 +; AVX-NEXT: imulq %rdi, %r13 ; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: mulq %r10 +; AVX-NEXT: mulq %r9 ; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: addq %r14, %rsi -; AVX-NEXT: adcq %r9, %rdi -; AVX-NEXT: movq %rdi, 
%r9 -; AVX-NEXT: sarq $63, %r9 -; AVX-NEXT: addq %r8, %rdi -; AVX-NEXT: adcq %rbp, %r9 +; AVX-NEXT: adcq %r13, %rdi +; AVX-NEXT: movq %rdi, %r14 +; AVX-NEXT: sarq $63, %r14 +; AVX-NEXT: addq %r15, %rdi +; AVX-NEXT: adcq %r8, %r14 ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: imulq %r10 +; AVX-NEXT: imulq %r9 ; AVX-NEXT: addq %rdi, %rax -; AVX-NEXT: adcq %r9, %rdx -; AVX-NEXT: movq %rsi, 8(%r15) -; AVX-NEXT: sarq $63, %rsi -; AVX-NEXT: xorq %rsi, %rdx -; AVX-NEXT: xorq %rax, %rsi -; AVX-NEXT: xorl %r11d, %r11d -; AVX-NEXT: orq %rdx, %rsi -; AVX-NEXT: setne %r11b -; AVX-NEXT: movq %rbx, %r10 -; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: imulq %r13, %r10 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %r13 -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rax, %rsi -; AVX-NEXT: movq %rbx, %rax -; AVX-NEXT: mulq %r13 +; AVX-NEXT: adcq %r14, %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX-NEXT: movq %rsi, %rdi +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: xorq %rdi, %rdx +; AVX-NEXT: xorq %rax, %rdi +; AVX-NEXT: xorl %ebp, %ebp +; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: setne %bpl +; AVX-NEXT: movq %rcx, %r15 +; AVX-NEXT: sarq $63, %r15 +; AVX-NEXT: imulq %r9, %r15 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %r9 ; AVX-NEXT: movq %rdx, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %rdi, %r9 -; AVX-NEXT: adcq %r10, %r8 -; AVX-NEXT: movq %r8, %r14 -; AVX-NEXT: sarq $63, %r14 -; AVX-NEXT: movq %r12, %r13 -; AVX-NEXT: sarq $63, %r13 -; AVX-NEXT: imulq %rcx, %r13 +; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %r12 -; AVX-NEXT: movq %rdx, %rdi -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %r9, %r10 -; AVX-NEXT: adcq %r13, %rdi -; AVX-NEXT: movq %rdi, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: addq %r8, %rdi -; AVX-NEXT: adcq %r14, %rcx -; AVX-NEXT: movq %rbx, %rax -; AVX-NEXT: imulq %r12 -; AVX-NEXT: addq %rdi, %rax -; AVX-NEXT: adcq %rcx, %rdx -; AVX-NEXT: movq %r10, 24(%r15) +; AVX-NEXT: mulq %r9 +; AVX-NEXT: movq %rdx, %r11 +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %r8, %r14 +; AVX-NEXT: adcq %r15, %r11 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %r11, %r13 +; AVX-NEXT: sarq $63, %r13 +; AVX-NEXT: movq %rcx, %r15 +; AVX-NEXT: sarq $63, %r15 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movq %rdx, %r8 +; AVX-NEXT: imulq %r10, %r15 +; AVX-NEXT: addq %r14, %r9 +; AVX-NEXT: adcq %r15, %r8 +; AVX-NEXT: movq %r8, %r10 ; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: xorq %r10, %rdx -; AVX-NEXT: xorq %rax, %r10 +; AVX-NEXT: addq %r11, %r8 +; AVX-NEXT: adcq %r13, %r10 +; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: imulq %rcx +; AVX-NEXT: addq %r8, %rax +; AVX-NEXT: adcq %r10, %rdx +; AVX-NEXT: movq %r9, 24(%r12) +; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: xorq %r9, %rdx +; AVX-NEXT: xorq %rax, %r9 ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %r10 +; AVX-NEXT: orq %rdx, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax -; AVX-NEXT: negl %r11d -; AVX-NEXT: vmovd %r11d, %xmm0 +; AVX-NEXT: negl %ebp +; AVX-NEXT: vmovd %ebp, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rsi, 16(%r15) +; AVX-NEXT: movq %rsi, 8(%r12) +; AVX-NEXT: movq %rdi, 16(%r12) ; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX-NEXT: movq %rax, (%r15) +; AVX-NEXT: movq %rax, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3723,85 +3723,85 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: pushq %rbx ; AVX512F-NEXT: movq %rcx, %r11 ; AVX512F-NEXT: 
movq %rdx, %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512F-NEXT: movq %r11, %rbp +; AVX512F-NEXT: movq %rcx, %rbp ; AVX512F-NEXT: sarq $63, %rbp -; AVX512F-NEXT: imulq %r14, %rbp ; AVX512F-NEXT: movq %rdx, %rax -; AVX512F-NEXT: mulq %r14 +; AVX512F-NEXT: mulq %r13 ; AVX512F-NEXT: movq %rdx, %rbx -; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r14 -; AVX512F-NEXT: movq %rdx, %r14 -; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %rbx, %r15 -; AVX512F-NEXT: adcq %rbp, %r14 -; AVX512F-NEXT: movq %r14, %rbp -; AVX512F-NEXT: sarq $63, %rbp -; AVX512F-NEXT: movq %rcx, %r13 +; AVX512F-NEXT: mulq %r13 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rdx, %r15 +; AVX512F-NEXT: imulq %r13, %rbp +; AVX512F-NEXT: addq %rbx, %r14 +; AVX512F-NEXT: adcq %rbp, %r15 +; AVX512F-NEXT: movq %r15, %r13 ; AVX512F-NEXT: sarq $63, %r13 -; AVX512F-NEXT: imulq %r10, %r13 +; AVX512F-NEXT: movq %r12, %rbp +; AVX512F-NEXT: sarq $63, %rbp +; AVX512F-NEXT: imulq %r10, %rbp ; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %rcx +; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: movq %rdx, %rbx ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r15, %r10 -; AVX512F-NEXT: adcq %r13, %rbx -; AVX512F-NEXT: movq %rbx, %r15 -; AVX512F-NEXT: sarq $63, %r15 -; AVX512F-NEXT: addq %r14, %rbx -; AVX512F-NEXT: adcq %rbp, %r15 +; AVX512F-NEXT: addq %r14, %r10 +; AVX512F-NEXT: adcq %rbp, %rbx +; AVX512F-NEXT: movq %rbx, %r14 +; AVX512F-NEXT: sarq $63, %r14 +; AVX512F-NEXT: addq %r15, %rbx +; AVX512F-NEXT: adcq %r13, %r14 ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: imulq %rcx +; AVX512F-NEXT: imulq %r12 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %r15, %rdx -; AVX512F-NEXT: movq %r10, 24(%r12) +; AVX512F-NEXT: adcq %r14, %rdx +; AVX512F-NEXT: movq %r10, 24(%r15) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 ; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: imulq %r8, %rcx +; AVX512F-NEXT: movq %rsi, %r12 +; AVX512F-NEXT: sarq $63, %r12 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r11 ; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: movq %rsi, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rdx, %r8 ; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: imulq %r8, %r12 ; AVX512F-NEXT: addq %r11, %rbx -; AVX512F-NEXT: adcq %rcx, %r8 -; AVX512F-NEXT: movq %r8, %rcx -; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: movq %r9, %r14 -; AVX512F-NEXT: sarq $63, %r14 -; AVX512F-NEXT: imulq %rdi, %r14 +; AVX512F-NEXT: adcq %r12, %r14 +; AVX512F-NEXT: movq %r14, %r11 +; AVX512F-NEXT: sarq $63, %r11 +; AVX512F-NEXT: movq %r9, %r12 +; AVX512F-NEXT: sarq $63, %r12 +; AVX512F-NEXT: imulq %rdi, %r12 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r9 ; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %r11 -; AVX512F-NEXT: addq %rbx, %r11 -; AVX512F-NEXT: adcq %r14, %rdi +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: addq %rbx, %r8 +; AVX512F-NEXT: adcq %r12, %rdi ; AVX512F-NEXT: movq %rdi, %rbx ; AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: addq %r8, %rdi -; AVX512F-NEXT: adcq 
%rcx, %rbx +; AVX512F-NEXT: addq %r14, %rdi +; AVX512F-NEXT: adcq %r11, %rbx ; AVX512F-NEXT: movq %rsi, %rax ; AVX512F-NEXT: imulq %r9 ; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rbx, %rdx -; AVX512F-NEXT: movq %r11, 8(%r12) -; AVX512F-NEXT: sarq $63, %r11 -; AVX512F-NEXT: xorq %r11, %rdx -; AVX512F-NEXT: xorq %rax, %r11 -; AVX512F-NEXT: orq %rdx, %r11 +; AVX512F-NEXT: movq %r8, 8(%r15) +; AVX512F-NEXT: sarq $63, %r8 +; AVX512F-NEXT: xorq %r8, %rdx +; AVX512F-NEXT: xorq %rax, %r8 +; AVX512F-NEXT: orq %rdx, %r8 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3809,9 +3809,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: movq %rax, 16(%r12) -; AVX512F-NEXT: movq %r10, (%r12) +; AVX512F-NEXT: movq %rcx, 16(%r15) +; AVX512F-NEXT: movq %r10, (%r15) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3830,85 +3829,85 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq %rcx, %r11 ; AVX512BW-NEXT: movq %rdx, %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512BW-NEXT: movq %r11, %rbp +; AVX512BW-NEXT: movq %rcx, %rbp ; AVX512BW-NEXT: sarq $63, %rbp -; AVX512BW-NEXT: imulq %r14, %rbp ; AVX512BW-NEXT: movq %rdx, %rax -; AVX512BW-NEXT: mulq %r14 +; AVX512BW-NEXT: mulq %r13 ; AVX512BW-NEXT: movq %rdx, %rbx -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movq %rax, %rcx ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %r14 -; AVX512BW-NEXT: movq %rdx, %r14 -; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %rbx, %r15 -; AVX512BW-NEXT: adcq %rbp, %r14 -; AVX512BW-NEXT: movq %r14, %rbp -; AVX512BW-NEXT: sarq $63, %rbp -; AVX512BW-NEXT: movq %rcx, %r13 +; AVX512BW-NEXT: mulq %r13 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rdx, %r15 +; AVX512BW-NEXT: imulq %r13, %rbp +; AVX512BW-NEXT: addq %rbx, %r14 +; AVX512BW-NEXT: adcq %rbp, %r15 +; AVX512BW-NEXT: movq %r15, %r13 ; AVX512BW-NEXT: sarq $63, %r13 -; AVX512BW-NEXT: imulq %r10, %r13 +; AVX512BW-NEXT: movq %r12, %rbp +; AVX512BW-NEXT: sarq $63, %rbp +; AVX512BW-NEXT: imulq %r10, %rbp ; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %rcx +; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: movq %rdx, %rbx ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r15, %r10 -; AVX512BW-NEXT: adcq %r13, %rbx -; AVX512BW-NEXT: movq %rbx, %r15 -; AVX512BW-NEXT: sarq $63, %r15 -; AVX512BW-NEXT: addq %r14, %rbx -; AVX512BW-NEXT: adcq %rbp, %r15 +; AVX512BW-NEXT: addq %r14, %r10 +; AVX512BW-NEXT: adcq %rbp, %rbx +; AVX512BW-NEXT: movq %rbx, %r14 +; AVX512BW-NEXT: sarq $63, %r14 +; AVX512BW-NEXT: addq %r15, %rbx +; AVX512BW-NEXT: adcq %r13, %r14 ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: imulq %rcx +; AVX512BW-NEXT: imulq %r12 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %r15, %rdx -; AVX512BW-NEXT: movq %r10, 24(%r12) +; AVX512BW-NEXT: adcq %r14, %rdx +; AVX512BW-NEXT: movq %r10, 24(%r15) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 ; 
AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %rsi, %rcx -; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: imulq %r8, %rcx +; AVX512BW-NEXT: movq %rsi, %r12 +; AVX512BW-NEXT: sarq $63, %r12 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r11 ; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: movq %rsi, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rdx, %r8 ; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: imulq %r8, %r12 ; AVX512BW-NEXT: addq %r11, %rbx -; AVX512BW-NEXT: adcq %rcx, %r8 -; AVX512BW-NEXT: movq %r8, %rcx -; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: movq %r9, %r14 -; AVX512BW-NEXT: sarq $63, %r14 -; AVX512BW-NEXT: imulq %rdi, %r14 +; AVX512BW-NEXT: adcq %r12, %r14 +; AVX512BW-NEXT: movq %r14, %r11 +; AVX512BW-NEXT: sarq $63, %r11 +; AVX512BW-NEXT: movq %r9, %r12 +; AVX512BW-NEXT: sarq $63, %r12 +; AVX512BW-NEXT: imulq %rdi, %r12 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r9 ; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %r11 -; AVX512BW-NEXT: addq %rbx, %r11 -; AVX512BW-NEXT: adcq %r14, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: addq %rbx, %r8 +; AVX512BW-NEXT: adcq %r12, %rdi ; AVX512BW-NEXT: movq %rdi, %rbx ; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: addq %r8, %rdi -; AVX512BW-NEXT: adcq %rcx, %rbx +; AVX512BW-NEXT: addq %r14, %rdi +; AVX512BW-NEXT: adcq %r11, %rbx ; AVX512BW-NEXT: movq %rsi, %rax ; AVX512BW-NEXT: imulq %r9 ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rbx, %rdx -; AVX512BW-NEXT: movq %r11, 8(%r12) -; AVX512BW-NEXT: sarq $63, %r11 -; AVX512BW-NEXT: xorq %r11, %rdx -; AVX512BW-NEXT: xorq %rax, %r11 -; AVX512BW-NEXT: orq %rdx, %r11 +; AVX512BW-NEXT: movq %r8, 8(%r15) +; AVX512BW-NEXT: sarq $63, %r8 +; AVX512BW-NEXT: xorq %r8, %rdx +; AVX512BW-NEXT: xorq %rax, %r8 +; AVX512BW-NEXT: orq %rdx, %r8 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 @@ -3916,9 +3915,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq %rax, 16(%r12) -; AVX512BW-NEXT: movq %r10, (%r12) +; AVX512BW-NEXT: movq %rcx, 16(%r15) +; AVX512BW-NEXT: movq %r10, (%r15) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 746c09e5e70db..8583f0dd0f38f 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -114,9 +114,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psubd %xmm1, %xmm3 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi) ; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm3, 8(%rdi) ; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; @@ -125,9 +125,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; 
AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vmovq %xmm1, (%rdi) ; AVX-NEXT: retq ; @@ -138,9 +138,9 @@ define <3 x i32> @ssubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.ssub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -277,7 +277,6 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: ssubo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movd %esi, %xmm1 ; SSE41-NEXT: pinsrd $1, %edx, %xmm1 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 @@ -289,6 +288,7 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: movdqa %xmm1, %xmm4 @@ -756,86 +756,32 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind { } define <2 x i32> @ssubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { -; SSE2-LABEL: ssubo_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: ssubo_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: psubq %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: ssubo_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; 
SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm3 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: ssubo_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: retq ; ; AVX-LABEL: ssubo_v2i64: ; AVX: # %bb.0: @@ -952,8 +898,8 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) ; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) @@ -983,8 +929,8 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: movw %ax, 9(%rdi) ; AVX-NEXT: vpextrd $2, %xmm1, %ecx +; AVX-NEXT: movw %ax, 9(%rdi) ; AVX-NEXT: movw %cx, 6(%rdi) ; AVX-NEXT: vpextrd $1, %xmm1, %edx ; AVX-NEXT: movw %dx, 3(%rdi) @@ -1012,8 +958,8 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -1082,9 +1028,9 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: ssubo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: subq %r8, %rdi ; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: seto %r8b ; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx 
@@ -1104,9 +1050,9 @@ define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; SSSE3-LABEL: ssubo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: subq %r8, %rdi ; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSSE3-NEXT: seto %r8b ; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx @@ -1126,9 +1072,9 @@ define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; SSE41-LABEL: ssubo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: seto %r8b ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx @@ -1147,9 +1093,9 @@ define <2 x i32> @ssubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; ; AVX-LABEL: ssubo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: subq %r8, %rdi ; AVX-NEXT: sbbq %r9, %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: seto %r8b ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index be7888cd76a6b..5ca304d1d470c 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -126,9 +126,9 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: pmaxud %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm1, 8(%rdi) ; SSE41-NEXT: movq %xmm1, (%rdi) ; SSE41-NEXT: retq ; @@ -137,9 +137,9 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vmovq %xmm1, (%rdi) ; AVX-NEXT: retq ; @@ -147,9 +147,9 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpltud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -298,7 +298,6 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: uaddo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movd %esi, %xmm0 ; SSE41-NEXT: pinsrd $1, %edx, %xmm0 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 @@ -311,6 +310,7 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: paddd %xmm0, %xmm3 ; SSE41-NEXT: pmaxud %xmm3, %xmm0 @@ -863,7 +863,7 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; ; AVX2-LABEL: 
uaddo_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 @@ -964,8 +964,8 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) ; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) @@ -993,8 +993,8 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: movw %cx, 6(%rdi) ; AVX1-NEXT: vpextrd $1, %xmm1, %edx ; AVX1-NEXT: movw %dx, 3(%rdi) @@ -1021,8 +1021,8 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: movw %cx, 6(%rdi) ; AVX2-NEXT: vpextrd $1, %xmm1, %edx ; AVX2-NEXT: movw %dx, 3(%rdi) @@ -1048,8 +1048,8 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -1118,80 +1118,80 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: uaddo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movl $0, %r11d ; SSE2-NEXT: sbbl %r11d, %r11d ; SSE2-NEXT: addq %r8, %rdi ; SSE2-NEXT: adcq %r9, %rsi +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: movd %r11d, %xmm1 -; SSE2-NEXT: sbbl %r10d, %r10d -; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%rax) -; SSE2-NEXT: movq %rdi, (%rax) -; SSE2-NEXT: movq %rcx, 24(%rax) -; SSE2-NEXT: movq %rsi, 8(%rax) +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rcx, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: xorl %r10d, %r10d +; SSSE3-NEXT: xorl %eax, %eax ; SSSE3-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movl $0, %r11d ; SSSE3-NEXT: sbbl %r11d, %r11d ; SSSE3-NEXT: addq %r8, %rdi ; SSSE3-NEXT: adcq %r9, %rsi +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: movd %r11d, %xmm1 -; 
SSSE3-NEXT: sbbl %r10d, %r10d -; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%rax) -; SSSE3-NEXT: movq %rdi, (%rax) -; SSSE3-NEXT: movq %rcx, 24(%rax) -; SSSE3-NEXT: movq %rsi, 8(%rax) +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rcx, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: xorl %r10d, %r10d +; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: movl $0, %r11d ; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: addq %r8, %rdi ; SSE41-NEXT: adcq %r9, %rsi -; SSE41-NEXT: sbbl %r10d, %r10d -; SSE41-NEXT: movd %r10d, %xmm0 +; SSE41-NEXT: sbbl %eax, %eax +; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%rax) -; SSE41-NEXT: movq %rdi, (%rax) -; SSE41-NEXT: movq %rcx, 24(%rax) -; SSE41-NEXT: movq %rsi, 8(%rax) +; SSE41-NEXT: movq %rdx, 16(%r10) +; SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rcx, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) ; SSE41-NEXT: retq ; ; AVX-LABEL: uaddo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: xorl %r10d, %r10d +; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movl $0, %r11d ; AVX-NEXT: sbbl %r11d, %r11d ; AVX-NEXT: addq %r8, %rdi ; AVX-NEXT: adcq %r9, %rsi -; AVX-NEXT: sbbl %r10d, %r10d -; AVX-NEXT: vmovd %r10d, %xmm0 +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%rax) -; AVX-NEXT: movq %rdi, (%rax) -; AVX-NEXT: movq %rcx, 24(%rax) -; AVX-NEXT: movq %rsi, 8(%rax) +; AVX-NEXT: movq %rdx, 16(%r10) +; AVX-NEXT: movq %rdi, (%r10) +; AVX-NEXT: movq %rcx, 24(%r10) +; AVX-NEXT: movq %rsi, 8(%r10) ; AVX-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i128: diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll index c0beb6fef3234..590d5199dcfcb 100644 --- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX @@ -41,48 +42,46 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { ; SSE2-LABEL: test_uitofp_v4i32_to_v4f32: -; SSE2: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] -; SSE2-NEXT: pand %xmm0, [[MASK]] -; After this instruction, MASK will have the value of the low parts -; of the vector. 
-; SSE2-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 -; SSE2-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE2-NEXT: addps [[MASK]], %xmm0 -; SSE2-NEXT: retq +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; Currently we commute the arguments of the first blend, but this could be -; improved to match the lowering of the second blend. ; SSE41-LABEL: test_uitofp_v4i32_to_v4f32: -; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: addps [[LOWVEC]], %xmm0 -; SSE41-NEXT: retq +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_uitofp_v4i32_to_v4f32: -; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX-NEXT: retq +; AVX: # %bb.0: +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; The lowering for AVX2 is a bit messy, because we select broadcast -; instructions, instead of folding the constant loads. 
; AVX2-LABEL: test_uitofp_v4i32_to_v4f32: -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX2-NEXT: retq +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32: ; AVX512F: # %bb.0: @@ -96,6 +95,12 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq +; After this instruction, MASK will have the value of the low parts +; of the vector. +; Currently we commute the arguments of the first blend, but this could be +; improved to match the lowering of the second blend. +; The lowering for AVX2 is a bit messy, because we select broadcast +; instructions, instead of folding the constant loads. %tmp = uitofp <4 x i32> %arg to <4 x float> ret <4 x float> %tmp } @@ -116,72 +121,87 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) { ; two sequences of instructions. ; ; SSE2-LABEL: test_uitofp_v8i32_to_v8f32: -; SSE2: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE2-NEXT: pand %[[MASK]], [[VECLOW]] -; SSE2-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE2-NEXT: por %[[LOWCST]], [[VECLOW]] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE2-NEXT: por %[[HIGHCST]], %xmm0 -; SSE2-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE2-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE2-NEXT: addps [[VECLOW]], %xmm0 -; MASK is the low vector of the second part after this point. 
-; SSE2-NEXT: pand %xmm1, %[[MASK]] -; SSE2-NEXT: por %[[LOWCST]], %[[MASK]] -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %[[HIGHCST]], %xmm1 -; SSE2-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE2-NEXT: addps %[[MASK]], %xmm1 -; SSE2-NEXT: retq +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE2-NEXT: subps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq ; ; SSE41-LABEL: test_uitofp_v8i32_to_v8f32: -; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 -; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE41-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE41-NEXT: addps [[VECLOW]], %xmm0 -; LOWCST is the low vector of the second part after this point. -; The operands of the blend are inverted because we reuse xmm1 -; in the next shift. 
-; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 -; SSE41-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE41-NEXT: addps %[[LOWCST]], %xmm1 -; SSE41-NEXT: retq +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE41-NEXT: subps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: subps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq ; -; Test that we are not lowering uinttofp to scalars -; AVX-NOT: cvtsd2ss -; AVX: retq +; AVX-LABEL: test_uitofp_v8i32_to_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq ; ; AVX2-LABEL: test_uitofp_v8i32_to_v8f32: -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 -; AVX2-NEXT: retq +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vsubps %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 
killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; MASK is the low vector of the second part after this point. +; LOWCST is the low vector of the second part after this point. +; The operands of the blend are inverted because we reuse xmm1 +; in the next shift. +; Test that we are not lowering uinttofp to scalars %tmp = uitofp <8 x i32> %arg to <8 x float> ret <8 x float> %tmp } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll index 63feb77c91586..c119bc8d3b54a 100644 --- a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll @@ -1,167 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s ; Check that the constant used in the vectors are the right ones. -; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]: -; SSE-NEXT: .long 65535 ## 0xffff -; SSE-NEXT: .long 65535 ## 0xffff -; SSE-NEXT: .long 65535 ## 0xffff -; SSE-NEXT: .long 65535 ## 0xffff - -; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]: -; CST-NEXT: .long 1258291200 ## 0x4b000000 -; CST-NEXT: .long 1258291200 ## 0x4b000000 -; CST-NEXT: .long 1258291200 ## 0x4b000000 -; CST-NEXT: .long 1258291200 ## 0x4b000000 - -; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]: -; CST-NEXT: .long 1392508928 ## 0x53000000 -; CST-NEXT: .long 1392508928 ## 0x53000000 -; CST-NEXT: .long 1392508928 ## 0x53000000 -; CST-NEXT: .long 1392508928 ## 0x53000000 - -; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]: -; CST-NEXT: .long 0x53000080 ## float 5.49764202E+11 -; CST-NEXT: .long 0x53000080 ## float 5.49764202E+11 -; CST-NEXT: .long 0x53000080 ## float 5.49764202E+11 -; CST-NEXT: .long 0x53000080 ## float 5.49764202E+11 - -; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]: -; AVX2-NEXT: .long 1258291200 ## 0x4b000000 - -; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]: -; AVX2-NEXT: .long 1392508928 ## 0x53000000 - -; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]: -; AVX2-NEXT: .long 0x53000080 ## float 5.49764202E+11 - define <4 x float> @test1(<4 x i32> %A) nounwind { -; CHECK-LABEL: test1: +; SSE-LABEL: test1: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: retq ; -; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] -; SSE-NEXT: pand %xmm0, [[MASK]] +; SSE41-LABEL: test1: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test1: +; AVX: ## %bb.0: +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; After this instruction, MASK will have the value of the low parts ; of the vector. -; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 -; SSE-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE-NEXT: addps [[MASK]], %xmm0 -; SSE-NEXT: retq -; ; Currently we commute the arguments of the first blend, but this could be ; improved to match the lowering of the second blend. -; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: addps [[LOWVEC]], %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX-NEXT: retq -; ; The lowering for AVX2 is a bit messy, because we select broadcast ; instructions, instead of folding the constant loads. 
-; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX2-NEXT: retq %C = uitofp <4 x i32> %A to <4 x float> ret <4 x float> %C } ; Match the AVX2 constants used in the next function -; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]: -; AVX2-NEXT: .long 1258291200 ## 0x4b000000 - -; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]: -; AVX2-NEXT: .long 1392508928 ## 0x53000000 - -; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]: -; AVX2-NEXT: .long 0x53000080 ## float 5.49764202E+11 - define <8 x float> @test2(<8 x i32> %A) nounwind { -; CHECK-LABEL: test2: +; SSE-LABEL: test2: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE-NEXT: subps %xmm6, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: subps %xmm6, %xmm1 +; SSE-NEXT: addps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; SSE41-LABEL: test2: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE41-NEXT: subps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: subps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: test2: +; AVX: ## %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test2: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vsubps %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; Legalization will break the thing is 2 x <4 x i32> on anthing prior AVX. ; The constant used for in the vector instruction are shared between the ; two sequences of instructions. -; -; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE-NEXT: pand %[[MASK]], [[VECLOW]] -; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE-NEXT: por %[[LOWCST]], [[VECLOW]] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE-NEXT: por %[[HIGHCST]], %xmm0 -; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE-NEXT: addps [[VECLOW]], %xmm0 ; MASK is the low vector of the second part after this point. -; SSE-NEXT: pand %xmm1, %[[MASK]] -; SSE-NEXT: por %[[LOWCST]], %[[MASK]] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %[[HIGHCST]], %xmm1 -; SSE-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE-NEXT: addps %[[MASK]], %xmm1 -; SSE-NEXT: retq -; -; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 -; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE41-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE41-NEXT: addps [[VECLOW]], %xmm0 ; LOWCST is the low vector of the second part after this point. ; The operands of the blend are inverted because we reuse xmm1 ; in the next shift. 
-; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 -; SSE41-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE41-NEXT: addps %[[LOWCST]], %xmm1 -; SSE41-NEXT: retq -; ; Test that we are not lowering uinttofp to scalars -; AVX-NOT: cvtsd2ss -; AVX: retq -; -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 -; AVX2-NEXT: retq %C = uitofp <8 x i32> %A to <8 x float> ret <8 x float> %C } define <4 x double> @test3(<4 x i32> %arg) { -; CHECK-LABEL: test3: +; SSE-LABEL: test3: +; SSE: ## %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: xorpd %xmm2, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15] +; SSE-NEXT: orpd %xmm3, %xmm0 +; SSE-NEXT: subpd %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: subpd %xmm3, %xmm1 +; SSE-NEXT: retq +; +; SSE41-LABEL: test3: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: subpd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: subpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: test3: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test3: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; This test used to crash because we were custom lowering it as if it was ; a conversion between <4 x i32> and <4 x float>. -; AVX: vsubpd -; AVX2: vsubpd -; CHECK: retq %tmp = uitofp <4 x i32> %arg to <4 x double> ret <4 x double> %tmp } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} +; CST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 62db6d234d301..552068b3f81f7 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -176,12 +176,12 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm1 ; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v3i32: @@ -194,12 +194,11 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovq %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v3i32: @@ -212,12 +211,11 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vmovdqa %xmm2, %xmm0 +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpextrd $2, %xmm1, 8(%rdi) +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovq %xmm1, (%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v3i32: @@ -226,13 +224,13 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -342,7 +340,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d 
%xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 @@ -461,48 +459,49 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: umulo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edi -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: movdqa %xmm3, %xmm2 +; SSE41-NEXT: pinsrd $1, %r10d, %xmm2 ; SSE41-NEXT: movd %r9d, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm1 -; SSE41-NEXT: pinsrd $1, %edi, %xmm2 ; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %r9d +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pinsrd $1, %r9d, %xmm0 +; SSE41-NEXT: movd %esi, %xmm4 +; SSE41-NEXT: pinsrd $1, %edx, %xmm4 +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: pinsrd $2, %ecx, %xmm4 +; SSE41-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm5 +; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm5 +; SSE41-NEXT: pmuludq %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: pinsrd $3, %r8d, %xmm4 +; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: movd %esi, %xmm2 -; SSE41-NEXT: pinsrd $1, %edx, %xmm2 -; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 -; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm4 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm2, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] ; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pxor %xmm6, %xmm3 -; SSE41-NEXT: movd %edi, %xmm7 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: movd %r10d, %xmm7 ; SSE41-NEXT: movd %r9d, %xmm8 ; SSE41-NEXT: pmuludq %xmm7, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] ; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE41-NEXT: pxor %xmm6, %xmm1 -; SSE41-NEXT: pmulld %xmm2, %xmm4 +; SSE41-NEXT: pmulld %xmm4, %xmm3 ; SSE41-NEXT: movq %xmm0, 16(%rcx) -; SSE41-NEXT: movdqa %xmm4, (%rcx) -; SSE41-NEXT: movq %xmm1, 16(%rax) -; SSE41-NEXT: movdqa %xmm3, (%rax) +; SSE41-NEXT: movdqa %xmm3, (%rcx) +; SSE41-NEXT: movq %xmm1, 16(%rdi) +; SSE41-NEXT: movdqa %xmm2, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: umulo_v6i32: @@ -560,7 +559,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} 
ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 @@ -730,7 +729,7 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] ; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] ; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 @@ -1022,7 +1021,7 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin ; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 @@ -1128,7 +1127,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm4, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm5 @@ -1419,7 +1418,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: pand %xmm2, %xmm6 ; SSE41-NEXT: pmullw %xmm5, %xmm4 @@ -1951,7 +1950,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] ; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pand %xmm9, %xmm4 ; SSE41-NEXT: pmullw %xmm10, %xmm8 @@ -2265,47 +2264,47 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 
x i8> %a1, ptr %p2) nounwind { ; ; AVX512F-LABEL: umulo_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, 
%ymm6 -; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k3 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k2 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k3 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7 -; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm0 +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, 48(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, 32(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpmovdb %zmm3, 16(%rdi) ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero ; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) ; AVX512F-NEXT: retq ; @@ -2315,7 +2314,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] @@ -2457,22 +2456,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rcx ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rsi +; SSE2-NEXT: movq %xmm2, %r8 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: movq %xmm1, %rdx -; SSE2-NEXT: xorl %r8d, %r8d +; SSE2-NEXT: xorl %r9d, %r9d ; SSE2-NEXT: mulq %rdx -; SSE2-NEXT: movq $-1, %r9 -; SSE2-NEXT: movl $0, %r10d -; SSE2-NEXT: cmovoq %r9, %r10 -; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: movq $-1, %r10 +; SSE2-NEXT: movl $0, %r11d +; SSE2-NEXT: cmovoq %r10, %r11 ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi +; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: movq %rsi, %xmm1 ; SSE2-NEXT: movq %rax, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movq %r10, %xmm0 -; SSE2-NEXT: cmovoq %r9, %r8 -; SSE2-NEXT: movq %r8, %xmm2 +; SSE2-NEXT: movq %r11, %xmm0 +; SSE2-NEXT: cmovoq %r10, %r9 +; SSE2-NEXT: movq %r9, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: movdqa %xmm1, (%rdi) @@ -2483,22 +2483,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSSE3-NEXT: movq %xmm2, %rcx ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSSE3-NEXT: movq %xmm2, %rsi +; SSSE3-NEXT: movq %xmm2, %r8 ; SSSE3-NEXT: movq %xmm0, %rax ; SSSE3-NEXT: movq %xmm1, %rdx -; SSSE3-NEXT: xorl %r8d, %r8d +; SSSE3-NEXT: xorl %r9d, %r9d ; SSSE3-NEXT: mulq %rdx -; SSSE3-NEXT: movq $-1, %r9 -; SSSE3-NEXT: movl $0, %r10d -; SSSE3-NEXT: cmovoq %r9, %r10 -; SSSE3-NEXT: movq %rax, %xmm1 +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: movq $-1, %r10 +; SSSE3-NEXT: movl $0, %r11d +; SSSE3-NEXT: cmovoq %r10, %r11 ; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %rsi +; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: movq %rsi, %xmm1 ; SSSE3-NEXT: movq %rax, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movq %r10, %xmm0 -; SSSE3-NEXT: cmovoq %r9, %r8 -; SSSE3-NEXT: movq 
%r8, %xmm2 +; SSSE3-NEXT: movq %r11, %xmm0 +; SSSE3-NEXT: cmovoq %r10, %r9 +; SSSE3-NEXT: movq %r9, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm1, (%rdi) @@ -2507,22 +2508,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE41-LABEL: umulo_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %xmm0, %rcx -; SSE41-NEXT: movq %xmm1, %rsi +; SSE41-NEXT: movq %xmm1, %r8 ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: pextrq $1, %xmm1, %rdx -; SSE41-NEXT: xorl %r8d, %r8d +; SSE41-NEXT: xorl %r9d, %r9d ; SSE41-NEXT: mulq %rdx -; SSE41-NEXT: movq $-1, %r9 -; SSE41-NEXT: movl $0, %r10d -; SSE41-NEXT: cmovoq %r9, %r10 -; SSE41-NEXT: movq %rax, %xmm0 +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: movq $-1, %r10 +; SSE41-NEXT: movl $0, %r11d +; SSE41-NEXT: cmovoq %r10, %r11 ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi +; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rsi, %xmm0 ; SSE41-NEXT: movq %rax, %xmm1 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: movq %r10, %xmm0 -; SSE41-NEXT: cmovoq %r9, %r8 -; SSE41-NEXT: movq %r8, %xmm2 +; SSE41-NEXT: movq %r11, %xmm0 +; SSE41-NEXT: cmovoq %r10, %r9 +; SSE41-NEXT: movq %r9, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: movdqa %xmm1, (%rdi) @@ -2531,22 +2533,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; AVX-LABEL: umulo_v2i64: ; AVX: # %bb.0: ; AVX-NEXT: vmovq %xmm0, %rcx -; AVX-NEXT: vmovq %xmm1, %rsi +; AVX-NEXT: vmovq %xmm1, %r8 ; AVX-NEXT: vpextrq $1, %xmm0, %rax ; AVX-NEXT: vpextrq $1, %xmm1, %rdx -; AVX-NEXT: xorl %r8d, %r8d +; AVX-NEXT: xorl %r9d, %r9d ; AVX-NEXT: mulq %rdx -; AVX-NEXT: movq $-1, %r9 -; AVX-NEXT: movl $0, %r10d -; AVX-NEXT: cmovoq %r9, %r10 -; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: movq $-1, %r10 +; AVX-NEXT: movl $0, %r11d +; AVX-NEXT: cmovoq %r10, %r11 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %r8 +; AVX-NEXT: vmovq %rsi, %xmm0 ; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %r10, %xmm0 -; AVX-NEXT: cmovoq %r9, %r8 -; AVX-NEXT: vmovq %r8, %xmm2 +; AVX-NEXT: vmovq %r11, %xmm0 +; AVX-NEXT: cmovoq %r10, %r9 +; AVX-NEXT: vmovq %r9, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovdqa %xmm1, (%rdi) @@ -2555,9 +2558,9 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; AVX512F-LABEL: umulo_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %xmm0, %rcx -; AVX512F-NEXT: vmovq %xmm1, %rsi ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512F-NEXT: vmovq %xmm1, %rsi ; AVX512F-NEXT: mulq %rdx ; AVX512F-NEXT: seto %r8b ; AVX512F-NEXT: vmovq %rax, %xmm0 @@ -2580,9 +2583,9 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; AVX512BW-LABEL: umulo_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq %xmm0, %rcx -; AVX512BW-NEXT: vmovq %xmm1, %rsi ; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax ; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512BW-NEXT: vmovq %xmm1, %rsi ; AVX512BW-NEXT: mulq %rdx ; AVX512BW-NEXT: seto %r8b ; AVX512BW-NEXT: vmovq %rax, %xmm0 @@ -2755,8 +2758,8 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, 
%xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: movw %cx, 6(%rdi) ; AVX1-NEXT: vpextrd $1, %xmm1, %edx ; AVX1-NEXT: movw %dx, 3(%rdi) @@ -2792,8 +2795,8 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: movw %cx, 6(%rdi) ; AVX2-NEXT: vpextrd $1, %xmm1, %edx ; AVX2-NEXT: movw %dx, 3(%rdi) @@ -2818,7 +2821,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,3,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] ; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 @@ -2827,8 +2830,8 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -2905,51 +2908,51 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r14 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r9, %r11 ; SSE2-NEXT: movq %rcx, %r10 ; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %dl ; SSE2-NEXT: testq %rsi, %rsi -; SSE2-NEXT: setne %bpl -; SSE2-NEXT: andb %dl, %bpl +; SSE2-NEXT: setne %sil +; SSE2-NEXT: andb %dl, %sil ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: seto %r15b -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: seto %bpl +; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE2-NEXT: seto %r12b -; SSE2-NEXT: orb %r15b, %r12b ; SSE2-NEXT: orb %bpl, %r12b -; SSE2-NEXT: leaq (%rsi,%rax), %r11 +; SSE2-NEXT: orb %sil, %r12b ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %r11, %rsi +; SSE2-NEXT: leaq (%r11,%rbx), %rax +; SSE2-NEXT: addq %rax, %rsi ; SSE2-NEXT: setb %r11b ; SSE2-NEXT: orb %r12b, %r11b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: testq %r10, %r10 -; SSE2-NEXT: setne %bpl -; SSE2-NEXT: andb %al, %bpl +; SSE2-NEXT: setne %bl +; SSE2-NEXT: andb %al, %bl ; SSE2-NEXT: movq %r10, %rax -; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rax, %r8 ; SSE2-NEXT: seto %r10b ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: seto %r9b ; SSE2-NEXT: orb %r10b, %r9b -; SSE2-NEXT: orb %bpl, %r9b +; SSE2-NEXT: orb %bl, %r9b ; SSE2-NEXT: addq %rax, %r8 ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq 
%r14 +; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: addq %r8, %rdx ; SSE2-NEXT: setb %cl ; SSE2-NEXT: orb %r9b, %cl @@ -2960,10 +2963,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rax, 16(%rbx) -; SSE2-NEXT: movq %rdi, (%rbx) -; SSE2-NEXT: movq %rdx, 24(%rbx) -; SSE2-NEXT: movq %rsi, 8(%rbx) +; SSE2-NEXT: movq %rax, 16(%r14) +; SSE2-NEXT: movq %rdi, (%r14) +; SSE2-NEXT: movq %rdx, 24(%r14) +; SSE2-NEXT: movq %rsi, 8(%r14) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r14 @@ -2978,51 +2981,51 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %r14 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r9, %r11 ; SSSE3-NEXT: movq %rcx, %r10 ; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %dl ; SSSE3-NEXT: testq %rsi, %rsi -; SSSE3-NEXT: setne %bpl -; SSSE3-NEXT: andb %dl, %bpl +; SSSE3-NEXT: setne %sil +; SSSE3-NEXT: andb %dl, %sil ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: seto %r15b -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: seto %bpl +; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSSE3-NEXT: seto %r12b -; SSSE3-NEXT: orb %r15b, %r12b ; SSSE3-NEXT: orb %bpl, %r12b -; SSSE3-NEXT: leaq (%rsi,%rax), %r11 +; SSSE3-NEXT: orb %sil, %r12b ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %r11, %rsi +; SSSE3-NEXT: leaq (%r11,%rbx), %rax +; SSSE3-NEXT: addq %rax, %rsi ; SSSE3-NEXT: setb %r11b ; SSSE3-NEXT: orb %r12b, %r11b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: testq %r10, %r10 -; SSSE3-NEXT: setne %bpl -; SSSE3-NEXT: andb %al, %bpl +; SSSE3-NEXT: setne %bl +; SSSE3-NEXT: andb %al, %bl ; SSSE3-NEXT: movq %r10, %rax -; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: movq %rax, %r8 ; SSSE3-NEXT: seto %r10b ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: seto %r9b ; SSSE3-NEXT: orb %r10b, %r9b -; SSSE3-NEXT: orb %bpl, %r9b +; SSSE3-NEXT: orb %bl, %r9b ; SSSE3-NEXT: addq %rax, %r8 ; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: addq %r8, %rdx ; SSSE3-NEXT: setb %cl ; SSSE3-NEXT: orb %r9b, %cl @@ -3033,10 +3036,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rax, 16(%rbx) -; SSSE3-NEXT: movq %rdi, (%rbx) -; SSSE3-NEXT: movq %rdx, 24(%rbx) -; SSSE3-NEXT: movq %rsi, 8(%rbx) +; SSSE3-NEXT: movq %rax, 16(%r14) +; SSSE3-NEXT: movq %rdi, (%r14) +; SSSE3-NEXT: movq %rdx, 24(%r14) +; SSSE3-NEXT: movq %rsi, 8(%r14) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r14 @@ -3051,51 +3054,51 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r14 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r9, 
%r11 ; SSE41-NEXT: movq %rcx, %r10 ; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %dl ; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setne %bpl -; SSE41-NEXT: andb %dl, %bpl +; SSE41-NEXT: setne %sil +; SSE41-NEXT: andb %dl, %sil ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: seto %r15b -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: movq %rax, %r11 +; SSE41-NEXT: seto %bpl +; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE41-NEXT: seto %r12b -; SSE41-NEXT: orb %r15b, %r12b ; SSE41-NEXT: orb %bpl, %r12b -; SSE41-NEXT: leaq (%rsi,%rax), %r11 +; SSE41-NEXT: orb %sil, %r12b ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %r11, %rsi +; SSE41-NEXT: leaq (%r11,%rbx), %rax +; SSE41-NEXT: addq %rax, %rsi ; SSE41-NEXT: setb %r11b ; SSE41-NEXT: orb %r12b, %r11b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r10, %r10 -; SSE41-NEXT: setne %bpl -; SSE41-NEXT: andb %al, %bpl +; SSE41-NEXT: setne %bl +; SSE41-NEXT: andb %al, %bl ; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rax, %r8 ; SSE41-NEXT: seto %r10b ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: seto %r9b ; SSE41-NEXT: orb %r10b, %r9b -; SSE41-NEXT: orb %bpl, %r9b +; SSE41-NEXT: orb %bl, %r9b ; SSE41-NEXT: addq %rax, %r8 ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: addq %r8, %rdx ; SSE41-NEXT: setb %cl ; SSE41-NEXT: orb %r9b, %cl @@ -3105,10 +3108,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: negl %r8d ; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, 16(%rbx) -; SSE41-NEXT: movq %rdi, (%rbx) -; SSE41-NEXT: movq %rdx, 24(%rbx) -; SSE41-NEXT: movq %rsi, 8(%rbx) +; SSE41-NEXT: movq %rax, 16(%r14) +; SSE41-NEXT: movq %rdi, (%r14) +; SSE41-NEXT: movq %rdx, 24(%r14) +; SSE41-NEXT: movq %rsi, 8(%r14) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r14 @@ -3123,51 +3126,51 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r9, %r11 ; AVX-NEXT: movq %rcx, %r10 ; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %dl ; AVX-NEXT: testq %rsi, %rsi -; AVX-NEXT: setne %bpl -; AVX-NEXT: andb %dl, %bpl +; AVX-NEXT: setne %sil +; AVX-NEXT: andb %dl, %sil ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rax, %rsi -; AVX-NEXT: seto %r15b -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: movq %rax, %r11 +; AVX-NEXT: seto %bpl +; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; AVX-NEXT: seto %r12b -; AVX-NEXT: orb %r15b, %r12b ; AVX-NEXT: orb %bpl, %r12b -; AVX-NEXT: leaq (%rsi,%rax), %r11 +; AVX-NEXT: orb %sil, %r12b ; AVX-NEXT: movq %rdi, %rax ; 
AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %r11, %rsi +; AVX-NEXT: leaq (%r11,%rbx), %rax +; AVX-NEXT: addq %rax, %rsi ; AVX-NEXT: setb %r11b ; AVX-NEXT: orb %r12b, %r11b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: testq %r10, %r10 -; AVX-NEXT: setne %bpl -; AVX-NEXT: andb %al, %bpl +; AVX-NEXT: setne %bl +; AVX-NEXT: andb %al, %bl ; AVX-NEXT: movq %r10, %rax -; AVX-NEXT: mulq %r14 +; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rax, %r8 ; AVX-NEXT: seto %r10b ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %rcx ; AVX-NEXT: seto %r9b ; AVX-NEXT: orb %r10b, %r9b -; AVX-NEXT: orb %bpl, %r9b +; AVX-NEXT: orb %bl, %r9b ; AVX-NEXT: addq %rax, %r8 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %r14 +; AVX-NEXT: mulq %r15 ; AVX-NEXT: addq %r8, %rdx ; AVX-NEXT: setb %cl ; AVX-NEXT: orb %r9b, %cl @@ -3177,10 +3180,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: negl %r8d ; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, 16(%rbx) -; AVX-NEXT: movq %rdi, (%rbx) -; AVX-NEXT: movq %rdx, 24(%rbx) -; AVX-NEXT: movq %rsi, 8(%rbx) +; AVX-NEXT: movq %rax, 16(%r14) +; AVX-NEXT: movq %rdi, (%r14) +; AVX-NEXT: movq %rdx, 24(%r14) +; AVX-NEXT: movq %rsi, 8(%r14) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r14 @@ -3193,35 +3196,37 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: pushq %r15 ; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx ; AVX512F-NEXT: movq %rcx, %rax ; AVX512F-NEXT: movq %rdx, %rcx ; AVX512F-NEXT: movq %rsi, %r10 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512F-NEXT: testq %rsi, %rsi ; AVX512F-NEXT: setne %dl ; AVX512F-NEXT: testq %rax, %rax ; AVX512F-NEXT: setne %bpl ; AVX512F-NEXT: andb %dl, %bpl -; AVX512F-NEXT: mulq %r14 +; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rax, %r11 -; AVX512F-NEXT: seto %r15b +; AVX512F-NEXT: seto %r12b ; AVX512F-NEXT: movq %rsi, %rax ; AVX512F-NEXT: mulq %rcx -; AVX512F-NEXT: seto %r12b -; AVX512F-NEXT: orb %r15b, %r12b -; AVX512F-NEXT: orb %bpl, %r12b -; AVX512F-NEXT: addq %rax, %r11 +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: seto %r13b +; AVX512F-NEXT: orb %r12b, %r13b +; AVX512F-NEXT: orb %bpl, %r13b ; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: mulq %r14 +; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rax, %rsi ; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: addq %r11, %rcx +; AVX512F-NEXT: leaq (%r11,%rbx), %rax +; AVX512F-NEXT: addq %rax, %rcx ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: orb %r12b, %al +; AVX512F-NEXT: orb %r13b, %al ; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: testq %r9, %r9 ; AVX512F-NEXT: setne %al @@ -3231,11 +3236,11 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: movq %r10, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: seto %bpl +; AVX512F-NEXT: seto %bl ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: seto %r9b -; AVX512F-NEXT: orb %bpl, %r9b +; AVX512F-NEXT: orb %bl, %r9b ; AVX512F-NEXT: orb %r11b, %r9b ; AVX512F-NEXT: addq %rax, %r10 ; AVX512F-NEXT: movq %rdi, %rax @@ -3249,12 +3254,13 @@ define <2 x i32> @umulo_v2i128(<2 x 
i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %rsi, 16(%rbx) -; AVX512F-NEXT: movq %rax, (%rbx) -; AVX512F-NEXT: movq %rcx, 24(%rbx) -; AVX512F-NEXT: movq %rdx, 8(%rbx) +; AVX512F-NEXT: movq %rsi, 16(%r14) +; AVX512F-NEXT: movq %rax, (%r14) +; AVX512F-NEXT: movq %rcx, 24(%r14) +; AVX512F-NEXT: movq %rdx, 8(%r14) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 +; AVX512F-NEXT: popq %r13 ; AVX512F-NEXT: popq %r14 ; AVX512F-NEXT: popq %r15 ; AVX512F-NEXT: popq %rbp @@ -3265,35 +3271,37 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: pushq %rbp ; AVX512BW-NEXT: pushq %r15 ; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq %rcx, %rax ; AVX512BW-NEXT: movq %rdx, %rcx ; AVX512BW-NEXT: movq %rsi, %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512BW-NEXT: testq %rsi, %rsi ; AVX512BW-NEXT: setne %dl ; AVX512BW-NEXT: testq %rax, %rax ; AVX512BW-NEXT: setne %bpl ; AVX512BW-NEXT: andb %dl, %bpl -; AVX512BW-NEXT: mulq %r14 +; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rax, %r11 -; AVX512BW-NEXT: seto %r15b +; AVX512BW-NEXT: seto %r12b ; AVX512BW-NEXT: movq %rsi, %rax ; AVX512BW-NEXT: mulq %rcx -; AVX512BW-NEXT: seto %r12b -; AVX512BW-NEXT: orb %r15b, %r12b -; AVX512BW-NEXT: orb %bpl, %r12b -; AVX512BW-NEXT: addq %rax, %r11 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: seto %r13b +; AVX512BW-NEXT: orb %r12b, %r13b +; AVX512BW-NEXT: orb %bpl, %r13b ; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: mulq %r14 +; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rax, %rsi ; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: addq %r11, %rcx +; AVX512BW-NEXT: leaq (%r11,%rbx), %rax +; AVX512BW-NEXT: addq %rax, %rcx ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: orb %r12b, %al +; AVX512BW-NEXT: orb %r13b, %al ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: testq %r9, %r9 ; AVX512BW-NEXT: setne %al @@ -3303,11 +3311,11 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: movq %r10, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: seto %bpl +; AVX512BW-NEXT: seto %bl ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: seto %r9b -; AVX512BW-NEXT: orb %bpl, %r9b +; AVX512BW-NEXT: orb %bl, %r9b ; AVX512BW-NEXT: orb %r11b, %r9b ; AVX512BW-NEXT: addq %rax, %r10 ; AVX512BW-NEXT: movq %rdi, %rax @@ -3321,12 +3329,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %rsi, 16(%rbx) -; AVX512BW-NEXT: movq %rax, (%rbx) -; AVX512BW-NEXT: movq %rcx, 24(%rbx) -; AVX512BW-NEXT: movq %rdx, 8(%rbx) +; AVX512BW-NEXT: movq %rsi, 16(%r14) +; AVX512BW-NEXT: movq %rax, (%r14) +; AVX512BW-NEXT: movq %rcx, 24(%r14) +; AVX512BW-NEXT: movq %rdx, 8(%r14) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 +; AVX512BW-NEXT: popq %r13 ; AVX512BW-NEXT: popq %r14 ; AVX512BW-NEXT: popq %r15 ; AVX512BW-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll 
b/llvm/test/CodeGen/X86/vec_usubo.ll index ceb1ad13bc153..6f4aa659dff15 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -136,9 +136,9 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: psubd %xmm1, %xmm2 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi) ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrd $2, %xmm2, 8(%rdi) ; SSE41-NEXT: movq %xmm2, (%rdi) ; SSE41-NEXT: retq ; @@ -147,9 +147,9 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX-NEXT: vmovq %xmm1, (%rdi) ; AVX-NEXT: retq ; @@ -157,9 +157,9 @@ define <3 x i32> @usubo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpnleud %xmm0, %xmm1, %k1 +; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) ; AVX512-NEXT: vmovq %xmm1, (%rdi) ; AVX512-NEXT: retq %t = call {<3 x i32>, <3 x i1>} @llvm.usub.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) @@ -317,7 +317,6 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; ; SSE41-LABEL: usubo_v6i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: movd %esi, %xmm0 ; SSE41-NEXT: pinsrd $1, %edx, %xmm0 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 @@ -329,6 +328,7 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm3 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm3 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -910,7 +910,7 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; ; AVX2-LABEL: usubo_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 @@ -1011,8 +1011,8 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 ; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: pextrd $2, %xmm0, %ecx +; SSE41-NEXT: movw %ax, 9(%rdi) ; SSE41-NEXT: movw %cx, 6(%rdi) ; SSE41-NEXT: pextrd $1, %xmm0, %edx ; SSE41-NEXT: movw %dx, 3(%rdi) @@ -1040,8 +1040,8 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: movw %ax, 9(%rdi) ; AVX1-NEXT: movw %cx, 6(%rdi) ; AVX1-NEXT: vpextrd $1, %xmm1, %edx ; AVX1-NEXT: movw %dx, 3(%rdi) @@ -1068,8 +1068,8 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x 
i24> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: movw %ax, 9(%rdi) ; AVX2-NEXT: movw %cx, 6(%rdi) ; AVX2-NEXT: vpextrd $1, %xmm1, %edx ; AVX2-NEXT: movw %dx, 3(%rdi) @@ -1095,8 +1095,8 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm1, %ecx +; AVX512-NEXT: movw %ax, 9(%rdi) ; AVX512-NEXT: movw %cx, 6(%rdi) ; AVX512-NEXT: vpextrd $1, %xmm1, %edx ; AVX512-NEXT: movw %dx, 3(%rdi) @@ -1165,80 +1165,80 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { define <2 x i32> @usubo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { ; SSE2-LABEL: usubo_v2i128: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE2-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movl $0, %r11d ; SSE2-NEXT: sbbl %r11d, %r11d ; SSE2-NEXT: subq %r8, %rdi ; SSE2-NEXT: sbbq %r9, %rsi +; SSE2-NEXT: sbbl %eax, %eax ; SSE2-NEXT: movd %r11d, %xmm1 -; SSE2-NEXT: sbbl %r10d, %r10d -; SSE2-NEXT: movd %r10d, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 16(%rax) -; SSE2-NEXT: movq %rdi, (%rax) -; SSE2-NEXT: movq %rcx, 24(%rax) -; SSE2-NEXT: movq %rsi, 8(%rax) +; SSE2-NEXT: movq %rdx, 16(%r10) +; SSE2-NEXT: movq %rdi, (%r10) +; SSE2-NEXT: movq %rcx, 24(%r10) +; SSE2-NEXT: movq %rsi, 8(%r10) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: usubo_v2i128: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: xorl %r10d, %r10d +; SSSE3-NEXT: xorl %eax, %eax ; SSSE3-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSSE3-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movl $0, %r11d ; SSSE3-NEXT: sbbl %r11d, %r11d ; SSSE3-NEXT: subq %r8, %rdi ; SSSE3-NEXT: sbbq %r9, %rsi +; SSSE3-NEXT: sbbl %eax, %eax ; SSSE3-NEXT: movd %r11d, %xmm1 -; SSSE3-NEXT: sbbl %r10d, %r10d -; SSSE3-NEXT: movd %r10d, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 16(%rax) -; SSSE3-NEXT: movq %rdi, (%rax) -; SSSE3-NEXT: movq %rcx, 24(%rax) -; SSSE3-NEXT: movq %rsi, 8(%rax) +; SSSE3-NEXT: movq %rdx, 16(%r10) +; SSSE3-NEXT: movq %rdi, (%r10) +; SSSE3-NEXT: movq %rcx, 24(%r10) +; SSSE3-NEXT: movq %rsi, 8(%r10) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: usubo_v2i128: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: xorl %r10d, %r10d +; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE41-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; SSE41-NEXT: movl $0, %r11d ; SSE41-NEXT: sbbl %r11d, %r11d ; SSE41-NEXT: subq %r8, %rdi ; SSE41-NEXT: sbbq %r9, %rsi -; SSE41-NEXT: sbbl %r10d, %r10d -; SSE41-NEXT: movd %r10d, %xmm0 +; SSE41-NEXT: sbbl %eax, %eax +; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pinsrd $1, %r11d, %xmm0 -; SSE41-NEXT: movq %rdx, 16(%rax) -; SSE41-NEXT: movq %rdi, (%rax) -; SSE41-NEXT: movq %rcx, 24(%rax) -; SSE41-NEXT: movq %rsi, 8(%rax) +; SSE41-NEXT: movq %rdx, 16(%r10) +; 
SSE41-NEXT: movq %rdi, (%r10) +; SSE41-NEXT: movq %rcx, 24(%r10) +; SSE41-NEXT: movq %rsi, 8(%r10) ; SSE41-NEXT: retq ; ; AVX-LABEL: usubo_v2i128: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: xorl %r10d, %r10d +; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movl $0, %r11d ; AVX-NEXT: sbbl %r11d, %r11d ; AVX-NEXT: subq %r8, %rdi ; AVX-NEXT: sbbq %r9, %rsi -; AVX-NEXT: sbbl %r10d, %r10d -; AVX-NEXT: vmovd %r10d, %xmm0 +; AVX-NEXT: sbbl %eax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 16(%rax) -; AVX-NEXT: movq %rdi, (%rax) -; AVX-NEXT: movq %rcx, 24(%rax) -; AVX-NEXT: movq %rsi, 8(%rax) +; AVX-NEXT: movq %rdx, 16(%r10) +; AVX-NEXT: movq %rdi, (%r10) +; AVX-NEXT: movq %rcx, 24(%r10) +; AVX-NEXT: movq %rsi, 8(%r10) ; AVX-NEXT: retq ; ; AVX512-LABEL: usubo_v2i128: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 5dcf19013f0b7..8551ce6844c82 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -357,7 +357,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -370,7 +370,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { ; ; AVX512-LABEL: test_bitreverse_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -462,7 +462,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -476,7 +476,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -575,7 +575,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -589,7 +589,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -690,7 +690,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -704,7 +704,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -829,31 +829,27 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_bitreverse_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; 
AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_bitreverse_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq ; ; XOPAVX1-LABEL: test_bitreverse_v32i8: ; XOPAVX1: # %bb.0: @@ -997,32 +993,28 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_bitreverse_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, 
%ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_bitreverse_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq ; ; XOPAVX1-LABEL: test_bitreverse_v16i16: ; XOPAVX1: # %bb.0: @@ -1196,32 +1188,28 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_bitreverse_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_bitreverse_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq ; ; XOPAVX1-LABEL: test_bitreverse_v8i32: ; XOPAVX1: # %bb.0: @@ -1399,32 +1387,28 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_bitreverse_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_bitreverse_v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; 
AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq ; ; XOPAVX1-LABEL: test_bitreverse_v4i64: ; XOPAVX1: # %bb.0: @@ -1643,13 +1627,11 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -1665,16 +1647,14 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1685,15 +1665,13 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_bitreverse_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -1943,18 +1921,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -1969,13 +1944,11 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -1983,8 +1956,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -1996,15 +1968,13 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2067,8 +2037,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v32i16: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2079,8 +2048,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v32i16: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -2301,18 +2269,15 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -2327,13 +2292,11 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -2341,8 +2304,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -2354,15 +2316,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2425,8 +2385,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v16i32: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2437,8 +2396,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v16i32: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -2667,18 +2625,15 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -2693,13 +2648,11 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: 
vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -2707,8 +2660,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -2720,15 +2672,13 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2791,8 +2741,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v8i64: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = 
[1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2803,8 +2752,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v8i64: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index bd5c9363794aa..fa583c3b5a046 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -270,7 +270,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { ; ; AVX2-LABEL: vsel_i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll b/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll index 4e9c0d9c1d0ca..6746ea1958c63 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll @@ -124,9 +124,9 @@ entry: define dso_local <8 x i32> @select_mul(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) { ; AVX512F-LABEL: select_mul: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpmulld %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -151,9 +151,9 @@ entry: define dso_local <8 x i32> @select_smax(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) { ; AVX512F-LABEL: select_smax: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -179,9 +179,9 @@ declare <8 x i32> @llvm.smax.v4i32(<8 x i32> %a, <8 x i32> %b) define dso_local <8 x i32> @select_smin(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) { ; AVX512F-LABEL: select_smin: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpminsd %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -207,9 +207,9 @@ declare <8 x i32> @llvm.smin.v4i32(<8 x i32> %a, <8 x i32> %b) define dso_local <8 x i32> @select_umax(<8 x 
i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) { ; AVX512F-LABEL: select_umax: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpmaxud %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -235,9 +235,9 @@ declare <8 x i32> @llvm.umax.v4i32(<8 x i32> %a, <8 x i32> %b) define dso_local <8 x i32> @select_umin(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) { ; AVX512F-LABEL: select_umin: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpminud %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 11e7fe85d0239..5b35530599b2f 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -110,10 +110,10 @@ define <8 x float> @fadd_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vaddps %ymm1, %ymm3, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fadd_v8f32_commute: @@ -203,8 +203,8 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; AVX2-LABEL: fadd_v16f32_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -301,8 +301,8 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; AVX2-LABEL: fadd_v16f32_commute_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; 
AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -724,10 +724,10 @@ define <8 x float> @fmul_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmulps %ymm1, %ymm3, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fmul_v8f32_commute: @@ -749,43 +749,43 @@ define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; SSE2-NEXT: movaps %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: por %xmm11, %xmm9 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: mulps %xmm1, %xmm0 -; SSE2-NEXT: mulps %xmm8, %xmm9 +; SSE2-NEXT: mulps %xmm8, %xmm7 ; SSE2-NEXT: mulps %xmm3, %xmm2 -; SSE2-NEXT: mulps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: mulps %xmm4, %xmm9 +; SSE2-NEXT: movaps %xmm7, 
%xmm1 +; SSE2-NEXT: movaps %xmm9, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fmul_v16f32_swap: @@ -821,8 +821,8 @@ define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; AVX2-LABEL: fmul_v16f32_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -851,43 +851,43 @@ define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; SSE2-NEXT: movaps %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: por %xmm11, %xmm9 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: mulps %xmm1, %xmm0 -; SSE2-NEXT: mulps %xmm8, %xmm9 +; SSE2-NEXT: mulps %xmm8, %xmm7 ; SSE2-NEXT: mulps %xmm3, %xmm2 -; SSE2-NEXT: mulps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: mulps %xmm4, %xmm9 +; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm9, 
%xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fmul_v16f32_commute_swap: @@ -923,8 +923,8 @@ define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; AVX2-LABEL: fmul_v16f32_commute_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -1152,8 +1152,8 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; AVX2-LABEL: fdiv_v16f32_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -1165,10 +1165,10 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 ; ; AVX512-LABEL: fdiv_v16f32_swap: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm3 ; AVX512-NEXT: vdivps %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm3, %zmm2 +; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> , <16 x float> %y @@ -1182,43 +1182,43 @@ define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; SSE2-NEXT: movaps %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 -; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: por %xmm11, %xmm9 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; 
SSE2-NEXT: pand %xmm2, %xmm11 ; SSE2-NEXT: pandn %xmm7, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm11, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pand %xmm7, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: por %xmm11, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: divps %xmm1, %xmm0 -; SSE2-NEXT: divps %xmm8, %xmm9 +; SSE2-NEXT: divps %xmm8, %xmm7 ; SSE2-NEXT: divps %xmm3, %xmm2 -; SSE2-NEXT: divps %xmm4, %xmm10 -; SSE2-NEXT: movaps %xmm9, %xmm1 -; SSE2-NEXT: movaps %xmm10, %xmm3 +; SSE2-NEXT: divps %xmm4, %xmm9 +; SSE2-NEXT: movaps %xmm7, %xmm1 +; SSE2-NEXT: movaps %xmm9, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: fdiv_v16f32_commute_swap: @@ -1254,8 +1254,8 @@ define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef ; AVX2-LABEL: fdiv_v16f32_commute_swap: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -1306,14 +1306,14 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1327,7 +1327,7 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] @@ -1338,8 +1338,8 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX512F-LABEL: fadd_v8f32_cast_cond: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1407,26 +1407,26 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1442,15 +1442,15 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm7, %ymm2 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1467,45 +1467,28 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n } define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { -; SSE2-LABEL: fsub_v8f32_cast_cond: -; SSE2: # %bb.0: -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand 
%xmm2, %xmm4 -; SSE2-NEXT: subps %xmm4, %xmm0 -; SSE2-NEXT: subps %xmm6, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: fsub_v8f32_cast_cond: -; SSE42: # %bb.0: -; SSE42-NEXT: movd %edi, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE42-NEXT: pand %xmm3, %xmm6 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE42-NEXT: pand %xmm2, %xmm4 -; SSE42-NEXT: subps %xmm4, %xmm0 -; SSE42-NEXT: subps %xmm6, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: fsub_v8f32_cast_cond: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: subps %xmm4, %xmm0 +; SSE-NEXT: subps %xmm6, %xmm1 +; SSE-NEXT: retq ; ; AVX2-LABEL: fsub_v8f32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -1573,22 +1556,22 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -1602,11 +1585,11 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -1653,14 +1636,14 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} 
xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1674,7 +1657,7 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] @@ -1685,8 +1668,8 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX512F-LABEL: fmul_v8f32_cast_cond: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1754,26 +1737,26 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1789,15 +1772,15 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, 
%ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm7, %ymm2 ; AVX2-NEXT: vmulpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1841,14 +1824,14 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -1862,7 +1845,7 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] @@ -1942,26 +1925,26 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm11, %xmm10 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -1977,15 +1960,15 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: 
vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm7, %ymm2 ; AVX2-NEXT: vdivpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -2106,45 +2089,28 @@ define <8 x i32> @add_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> } define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { -; SSE2-LABEL: add_v8i32_cast_cond: -; SSE2: # %bb.0: -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: add_v8i32_cast_cond: -; SSE42: # %bb.0: -; SSE42-NEXT: movd %edi, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE42-NEXT: pand %xmm3, %xmm6 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE42-NEXT: pand %xmm2, %xmm4 -; SSE42-NEXT: paddd %xmm4, %xmm0 -; SSE42-NEXT: paddd %xmm6, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: add_v8i32_cast_cond: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: paddd %xmm4, %xmm0 +; SSE-NEXT: paddd %xmm6, %xmm1 +; SSE-NEXT: retq ; ; AVX2-LABEL: add_v8i32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -2212,22 +2178,22 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq 
%xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -2241,11 +2207,11 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -2559,45 +2525,28 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, } define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { -; SSE2-LABEL: sub_v8i32_cast_cond: -; SSE2: # %bb.0: -; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm6, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: sub_v8i32_cast_cond: -; SSE42: # %bb.0: -; SSE42-NEXT: movd %edi, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] -; SSE42-NEXT: movdqa %xmm4, %xmm6 -; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE42-NEXT: pand %xmm3, %xmm6 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE42-NEXT: pand %xmm2, %xmm4 -; SSE42-NEXT: psubd %xmm4, %xmm0 -; SSE42-NEXT: psubd %xmm6, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: sub_v8i32_cast_cond: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: psubd %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm6, %xmm1 +; SSE-NEXT: retq ; ; AVX2-LABEL: sub_v8i32_cast_cond: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -2665,22 +2614,22 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm10 ; SSE42-NEXT: pand %xmm7, %xmm10 ; SSE42-NEXT: pcmpeqq %xmm7, 
%xmm10 ; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 ; SSE42-NEXT: pand %xmm5, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -2694,11 +2643,11 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -2756,10 +2705,10 @@ define <4 x i32> @mul_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vpmulld %xmm3, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2840,10 +2789,10 @@ define <8 x i32> @mul_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vpmulld %ymm1, %ymm3, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: mul_v8i32_commute: @@ -2899,14 +2848,14 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 ; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1] ; SSE42-NEXT: movaps %xmm6, %xmm7 ; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8] ; SSE42-NEXT: pand %xmm0, %xmm5 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 ; SSE42-NEXT: movdqa %xmm5, %xmm0 @@ -2920,7 +2869,7 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[1,1,1,1,1,1,1,1] @@ -2931,8 +2880,8 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX512F-LABEL: mul_v8i32_cast_cond: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -3036,26 +2985,26 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] ; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] ; SSE42-NEXT: pand %xmm0, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 @@ -3107,12 +3056,12 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 @@ -3615,12 +3564,12 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm4, %xmm6 ; SSE42-NEXT: pand %xmm5, %xmm6 ; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE42-NEXT: pand %xmm3, %xmm6 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] ; SSE42-NEXT: pand %xmm3, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE42-NEXT: pand %xmm2, %xmm4 @@ -3639,7 +3588,7 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -3683,35 
+3632,35 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pand %xmm6, %xmm11 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,0,3,2] ; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psllq %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psllq %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE2-NEXT: psllq %xmm5, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: pand %xmm4, %xmm11 ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psllq %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: psllq %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] ; SSE2-NEXT: psllq %xmm5, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psllq %xmm7, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psllq %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; SSE2-NEXT: psllq %xmm5, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; SSE2-NEXT: psllq %xmm4, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psllq %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] @@ -3723,37 +3672,37 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm10 -; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 -; SSE42-NEXT: pand %xmm5, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 +; SSE42-NEXT: movdqa {{.*#+}} xmm11 = [1,2] +; SSE42-NEXT: pand %xmm11, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm11, %xmm9 +; SSE42-NEXT: movdqa {{.*#+}} xmm11 = [4,8] +; SSE42-NEXT: pand %xmm11, %xmm10 ; SSE42-NEXT: pand %xmm4, %xmm9 ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: 
psllq %xmm9, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] -; SSE42-NEXT: psllq %xmm5, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm11, %xmm10 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; SSE42-NEXT: psllq %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psllq %xmm10, %xmm5 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE42-NEXT: psllq %xmm6, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psllq %xmm10, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; SSE42-NEXT: psllq %xmm5, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm4 ; SSE42-NEXT: psllq %xmm7, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE42-NEXT: psllq %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] @@ -3768,11 +3717,11 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -3803,14 +3752,14 @@ define <4 x i32> @lshr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm0, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -3827,12 +3776,12 @@ define <4 x i32> @lshr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde ; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm5 ; SSE42-NEXT: psrld %xmm4, %xmm5 -; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: psrld %xmm3, %xmm0 +; SSE42-NEXT: psrld %xmm4, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE42-NEXT: retq @@ -3874,43 +3823,43 @@ define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> 
noundef %x, <8 x i32 ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld %xmm4, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: psrld %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: psrld %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrld %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm3, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrld %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm3, %xmm6 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm2, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3] ; SSE2-NEXT: retq ; @@ -3918,28 +3867,28 @@ define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm5 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pand %xmm4, %xmm5 ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: psrld %xmm4, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm1[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrld %xmm4, %xmm7 -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: psrld %xmm6, %xmm7 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrld %xmm6, %xmm8 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: psrld %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrld %xmm3, %xmm6 +; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE42-NEXT: pslld $31, %xmm5 +; SSE42-NEXT: psrad $31, %xmm5 +; SSE42-NEXT: pand %xmm4, %xmm5 ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm1, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5],xmm8[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm4 @@ -3947,10 +3896,10 @@ define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm6 ; SSE42-NEXT: psrld %xmm3, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm1 ; SSE42-NEXT: psrld %xmm3, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm5 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] @@ -3993,23 +3942,9 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; SSE2-LABEL: lshr_v16i32_swap: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: pandn %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm10 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 @@ -4017,134 +3952,150 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld %xmm0, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld %xmm6, %xmm0 +; 
SSE2-NEXT: psrld %xmm11, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: psrld %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm10 +; SSE2-NEXT: psrad $31, %xmm10 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm6, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: psrld %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: psrld %xmm1, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrld %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: psrld %xmm7, %xmm10 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psrld %xmm2, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psrld %xmm6, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: psrld %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: 
psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: psrld %xmm3, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: psrld %xmm6, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrld %xmm7, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; SSE2-NEXT: movaps %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: lshr_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm8 -; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm7 -; SSE42-NEXT: psrad $31, %xmm7 -; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm10 +; SSE42-NEXT: psrld %xmm5, %xmm10 +; SSE42-NEXT: pandn %xmm7, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm11 +; SSE42-NEXT: psrld %xmm5, %xmm11 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrld %xmm5, %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm12 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm12 +; SSE42-NEXT: psrad $31, %xmm12 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = 
xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm5 ; SSE42-NEXT: psrad $31, %xmm5 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm9 -; SSE42-NEXT: psrld %xmm0, %xmm9 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm11 -; SSE42-NEXT: psrld %xmm0, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: psrld %xmm6, %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pandn %xmm6, %xmm12 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm6, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrld %xmm1, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm10 -; SSE42-NEXT: psrld %xmm1, %xmm10 -; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm8 +; SSE42-NEXT: psrld %xmm1, %xmm8 +; SSE42-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: psrld %xmm6, %xmm1 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm10, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm6, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: psrld %xmm2, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm9 -; SSE42-NEXT: psrld %xmm2, %xmm9 -; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm8 +; SSE42-NEXT: psrld %xmm2, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm2 ; SSE42-NEXT: psrld %xmm6, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm6, %xmm3 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa 
%xmm4, %xmm6 ; SSE42-NEXT: psrld %xmm3, %xmm6 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm4, %xmm8 @@ -4192,177 +4143,177 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { ; SSE2-LABEL: lshr_v16i32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: psrld %xmm8, %xmm11 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm12 +; SSE2-NEXT: psrld %xmm5, %xmm12 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: psrld %xmm5, %xmm13 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm10 +; SSE2-NEXT: psrad $31, %xmm10 +; SSE2-NEXT: pandn %xmm7, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,3],xmm12[0,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: psrld %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: psrld %xmm5, %xmm9 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm2, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: psrld %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm5, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: psrld %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: psrld %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm4, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm5 +; SSE2-NEXT: psrld %xmm2, %xmm5 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: psrld %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm2, %xmm10 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm6[0] +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm5[0,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: psrld %xmm3, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: psrld %xmm2, %xmm3 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: psrld %xmm2, %xmm5 
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm2, %xmm8 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm3[0,3] +; SSE2-NEXT: movaps %xmm10, %xmm2 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: lshr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: movdqa %xmm2, %xmm10 ; SSE42-NEXT: movdqa %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm2 ; SSE42-NEXT: psrad $31, %xmm2 ; SSE42-NEXT: pandn %xmm7, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm1 -; SSE42-NEXT: psrad $31, %xmm1 -; SSE42-NEXT: pandn %xmm6, %xmm1 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pandn %xmm5, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm3 -; SSE42-NEXT: psrad $31, %xmm3 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 -; SSE42-NEXT: psrld %xmm6, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm5, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: psrld %xmm5, %xmm11 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm12 +; SSE42-NEXT: psrld %xmm5, %xmm12 +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm5 +; SSE42-NEXT: psrld %xmm1, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm1 +; SSE42-NEXT: psrad $31, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm9[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm7 +; SSE42-NEXT: psrad $31, %xmm7 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm7 +; SSE42-NEXT: 
pandn %xmm6, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm6, %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm11[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5],xmm12[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm8 ; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: psrld %xmm6, %xmm9 ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm8 ; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: psrld %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm8[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm5, %xmm2 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm3, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm8 -; SSE42-NEXT: psrld %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm6 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrld %xmm4, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; 
SSE42-NEXT: movdqa %xmm7, %xmm8 +; SSE42-NEXT: psrld %xmm5, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm5 +; SSE42-NEXT: psrld %xmm3, %xmm5 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrld %xmm3, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: lshr_v16i32_commute_swap: @@ -4402,25 +4353,25 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrld %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm6, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld %xmm5, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrld %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrld %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrld %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm3, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 @@ -4428,14 +4379,14 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm0, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrld %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrld %xmm4, %xmm6 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm0, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: movaps %xmm3, %xmm1 @@ -4445,30 +4396,30 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE42-NEXT: 
pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm4 ; SSE42-NEXT: pand %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE42-NEXT: pand %xmm6, %xmm5 +; SSE42-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE42-NEXT: pand %xmm2, %xmm5 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psrld %xmm2, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrld %xmm2, %xmm6 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrld %xmm7, %xmm8 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrld %xmm6, %xmm7 -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrld %xmm3, %xmm5 +; SSE42-NEXT: psrld %xmm5, %xmm7 +; SSE42-NEXT: pand %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5],xmm8[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: psrld %xmm2, %xmm3 @@ -4476,13 +4427,13 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrld %xmm5, %xmm6 +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psrld %xmm4, %xmm5 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psrld %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrld %xmm2, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] ; SSE42-NEXT: retq ; @@ -4490,7 +4441,7 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -4534,35 +4485,35 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE2-NEXT: movdqa %xmm9, 
%xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] -; SSE2-NEXT: pand %xmm5, %xmm10 -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pand %xmm6, %xmm11 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,0,3,2] ; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlq %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,0,3,2] +; SSE2-NEXT: pand %xmm6, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: psrlq %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE2-NEXT: psrlq %xmm5, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE2-NEXT: pand %xmm4, %xmm11 ; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrlq %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: psrlq %xmm11, %xmm4 +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] ; SSE2-NEXT: psrlq %xmm5, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrlq %xmm7, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlq %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; SSE2-NEXT: psrlq %xmm5, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; SSE2-NEXT: psrlq %xmm4, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrlq %xmm8, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] @@ -4574,37 +4525,37 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 -; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm10 -; SSE42-NEXT: pand %xmm6, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 -; SSE42-NEXT: pand %xmm5, %xmm10 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] -; SSE42-NEXT: pand %xmm5, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 +; SSE42-NEXT: movdqa {{.*#+}} xmm11 = [1,2] +; SSE42-NEXT: pand %xmm11, %xmm9 +; SSE42-NEXT: pcmpeqq %xmm11, %xmm9 +; SSE42-NEXT: movdqa {{.*#+}} xmm11 = [4,8] +; SSE42-NEXT: pand %xmm11, %xmm10 ; SSE42-NEXT: pand %xmm4, %xmm9 ; SSE42-NEXT: movdqa %xmm0, %xmm4 ; SSE42-NEXT: psrlq %xmm9, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] -; SSE42-NEXT: psrlq %xmm5, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm11, %xmm10 +; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; SSE42-NEXT: psrlq %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm5, %xmm10 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: 
psrlq %xmm10, %xmm5 +; SSE42-NEXT: pand %xmm6, %xmm7 +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE42-NEXT: psrlq %xmm6, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psrlq %xmm10, %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; SSE42-NEXT: psrlq %xmm5, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm4 ; SSE42-NEXT: psrlq %xmm7, %xmm4 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE42-NEXT: psrlq %xmm5, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] @@ -4619,11 +4570,11 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -4654,14 +4605,14 @@ define <4 x i32> @ashr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm0, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -4678,12 +4629,12 @@ define <4 x i32> @ashr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde ; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm5 ; SSE42-NEXT: psrad %xmm4, %xmm5 -; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: psrad %xmm3, %xmm0 +; SSE42-NEXT: psrad %xmm4, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE42-NEXT: retq @@ -4725,29 +4676,29 @@ define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrad %xmm4, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: psrad %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: psrad %xmm3, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrad %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 @@ -4755,13 +4706,13 @@ define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrad %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrad %xmm3, %xmm6 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm2, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3] ; SSE2-NEXT: retq ; @@ -4769,28 +4720,28 @@ define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa %xmm0, %xmm5 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE42-NEXT: pslld $31, %xmm5 -; SSE42-NEXT: psrad $31, %xmm5 -; SSE42-NEXT: pand %xmm4, %xmm5 ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: psrad %xmm4, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrad %xmm4, %xmm7 -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: psrad %xmm6, %xmm7 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrad %xmm6, %xmm8 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; 
SSE42-NEXT: psrad %xmm3, %xmm4 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrad %xmm3, %xmm6 +; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE42-NEXT: pslld $31, %xmm5 +; SSE42-NEXT: psrad $31, %xmm5 +; SSE42-NEXT: pand %xmm4, %xmm5 ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm1, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5],xmm8[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm4 @@ -4798,10 +4749,10 @@ define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm6 ; SSE42-NEXT: psrad %xmm3, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm5, %xmm1 ; SSE42-NEXT: psrad %xmm3, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm5 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] @@ -4844,23 +4795,9 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; SSE2-LABEL: ashr_v16i32_swap: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm9 -; SSE2-NEXT: psrad $31, %xmm9 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm10 -; SSE2-NEXT: psrad $31, %xmm10 -; SSE2-NEXT: pandn %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: pandn %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm10 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 @@ -4868,134 +4805,150 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrad %xmm0, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad %xmm6, %xmm0 +; SSE2-NEXT: psrad %xmm11, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: psrad %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn 
{{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm9 +; SSE2-NEXT: psrad $31, %xmm9 +; SSE2-NEXT: pandn %xmm7, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm10 +; SSE2-NEXT: psrad $31, %xmm10 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm6, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: psrad %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: psrad %xmm1, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: psrad %xmm7, %xmm10 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrad %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm5, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psrad %xmm2, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psrad %xmm6, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: psrad %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm5, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: psrad %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: psrad %xmm3, %xmm5 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: psrad %xmm6, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: psrad %xmm7, %xmm5 +; 
SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: psrad %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm5, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; SSE2-NEXT: movaps %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: ashr_v16i32_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm8 -; SSE42-NEXT: psrad $31, %xmm8 -; SSE42-NEXT: pandn %xmm7, %xmm8 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm7 -; SSE42-NEXT: psrad $31, %xmm7 -; SSE42-NEXT: pandn %xmm6, %xmm7 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm6 -; SSE42-NEXT: psrad $31, %xmm6 -; SSE42-NEXT: pandn %xmm5, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm9 +; SSE42-NEXT: psrad $31, %xmm9 +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm0 +; SSE42-NEXT: psrad $31, %xmm0 +; SSE42-NEXT: pandn %xmm5, %xmm0 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm10 +; SSE42-NEXT: psrad %xmm5, %xmm10 +; SSE42-NEXT: pandn %xmm7, %xmm9 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm11 +; SSE42-NEXT: psrad %xmm5, %xmm11 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: psrad %xmm5, %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm12 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm12 +; SSE42-NEXT: psrad $31, %xmm12 +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm5 ; SSE42-NEXT: psrad $31, %xmm5 ; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm9 -; SSE42-NEXT: psrad %xmm0, %xmm9 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm11 -; 
SSE42-NEXT: psrad %xmm0, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm0 -; SSE42-NEXT: psrad %xmm6, %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pandn %xmm6, %xmm12 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm6, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrad %xmm1, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm10 -; SSE42-NEXT: psrad %xmm1, %xmm10 -; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm8 +; SSE42-NEXT: psrad %xmm1, %xmm8 +; SSE42-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm1 -; SSE42-NEXT: psrad %xmm6, %xmm1 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm10, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm6, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: psrad %xmm2, %xmm6 -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm9 -; SSE42-NEXT: psrad %xmm2, %xmm9 -; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm3, %xmm8 +; SSE42-NEXT: psrad %xmm2, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm2 ; SSE42-NEXT: psrad %xmm6, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm6, %xmm3 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm4, %xmm6 ; SSE42-NEXT: psrad %xmm3, %xmm6 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm4, %xmm8 @@ -5043,177 +4996,177 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 
; SSE2-LABEL: ashr_v16i32_commute_swap: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: psrad %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: psrad %xmm8, %xmm11 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrad %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm12 +; SSE2-NEXT: psrad %xmm5, %xmm12 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: psrad %xmm5, %xmm13 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm10 +; SSE2-NEXT: psrad $31, %xmm10 +; SSE2-NEXT: pandn %xmm7, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm12[0,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: psrad %xmm6, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrad %xmm5, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: psrad %xmm5, %xmm9 ; SSE2-NEXT: 
punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm5, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm2, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrad %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: psrad %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm5, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: psrad %xmm6, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm6 ; SSE2-NEXT: psrad %xmm5, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: psrad %xmm5, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm4, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm5 +; SSE2-NEXT: psrad %xmm2, %xmm5 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: psrad %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm2, %xmm10 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm6[0] +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm5[0,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: psrad %xmm3, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: psrad %xmm2, %xmm3 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: psrad %xmm2, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm2, %xmm8 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm3[0,3] +; SSE2-NEXT: movaps %xmm10, %xmm2 +; SSE2-NEXT: movaps %xmm8, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: ashr_v16i32_commute_swap: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm3, %xmm10 -; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: movdqa %xmm2, %xmm10 ; SSE42-NEXT: 
movdqa %xmm1, %xmm8 -; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm2 ; SSE42-NEXT: psrad $31, %xmm2 ; SSE42-NEXT: pandn %xmm7, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm1 -; SSE42-NEXT: psrad $31, %xmm1 -; SSE42-NEXT: pandn %xmm6, %xmm1 -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero ; SSE42-NEXT: pslld $31, %xmm0 ; SSE42-NEXT: psrad $31, %xmm0 ; SSE42-NEXT: pandn %xmm5, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE42-NEXT: pslld $31, %xmm3 -; SSE42-NEXT: psrad $31, %xmm3 -; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm11 -; SSE42-NEXT: psrad %xmm6, %xmm11 -; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm6 -; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm5, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE42-NEXT: psrad %xmm5, %xmm11 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm12 +; SSE42-NEXT: psrad %xmm5, %xmm12 +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm5 +; SSE42-NEXT: psrad %xmm1, %xmm5 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm1 +; SSE42-NEXT: psrad $31, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm9[3,3,3,3] +; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE42-NEXT: pslld $31, %xmm7 +; SSE42-NEXT: psrad $31, %xmm7 +; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm7 +; SSE42-NEXT: pandn %xmm6, %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm6, %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm11[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5],xmm12[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm7 
-; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm8 ; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm9 +; SSE42-NEXT: psrad %xmm6, %xmm9 ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm5, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm7 -; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm8 ; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm2, %xmm9 +; SSE42-NEXT: psrad %xmm6, %xmm9 +; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm8[4,5,6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm2, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm5, %xmm2 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm3, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm7 -; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm8 -; SSE42-NEXT: psrad %xmm6, %xmm8 -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm6 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 -; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE42-NEXT: psrad %xmm4, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm8 +; SSE42-NEXT: psrad %xmm5, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm5 +; SSE42-NEXT: psrad %xmm3, %xmm5 +; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: psrad %xmm3, %xmm7 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = 
xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] +; SSE42-NEXT: movdqa %xmm7, %xmm3 ; SSE42-NEXT: retq ; ; AVX2-LABEL: ashr_v16i32_commute_swap: @@ -5253,25 +5206,25 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pand %xmm2, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad %xmm6, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrad %xmm5, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; SSE2-NEXT: psrad %xmm2, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrad %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: psrad %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm3, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm5 @@ -5279,14 +5232,14 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrad %xmm0, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrad %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: psrad %xmm4, %xmm6 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm0, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: movaps %xmm3, %xmm1 @@ -5296,30 +5249,30 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42: # %bb.0: ; SSE42-NEXT: movd %edi, %xmm4 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] -; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] ; SSE42-NEXT: movdqa %xmm5, %xmm4 ; SSE42-NEXT: pand %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE42-NEXT: pand %xmm3, %xmm4 -; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE42-NEXT: pand %xmm3, %xmm5 -; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE42-NEXT: pand %xmm6, %xmm5 +; SSE42-NEXT: pcmpeqd %xmm6, %xmm5 
; SSE42-NEXT: pand %xmm2, %xmm5 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psrad %xmm2, %xmm3 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psrad %xmm2, %xmm6 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,3,3,3,4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: psrad %xmm7, %xmm8 +; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm7 -; SSE42-NEXT: psrad %xmm6, %xmm7 -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, %xmm5 -; SSE42-NEXT: psrad %xmm3, %xmm5 +; SSE42-NEXT: psrad %xmm5, %xmm7 +; SSE42-NEXT: pand %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5],xmm8[6,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: psrad %xmm2, %xmm3 @@ -5327,13 +5280,13 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, %xmm6 ; SSE42-NEXT: psrad %xmm5, %xmm6 +; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psrad %xmm4, %xmm5 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: psrad %xmm3, %xmm4 ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] ; SSE42-NEXT: psrad %xmm2, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] ; SSE42-NEXT: retq ; @@ -5341,7 +5294,7 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 @@ -5440,11 +5393,11 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE2-NEXT: psrlq %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] ; SSE2-NEXT: psrlq %xmm6, %xmm4 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrlq %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: psrlq %xmm8, %xmm7 ; SSE2-NEXT: psrlq %xmm6, %xmm3 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] ; SSE2-NEXT: xorpd %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq @@ -5453,22 +5406,22 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42: # 
%bb.0: ; SSE42-NEXT: movd %edi, %xmm8 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] ; SSE42-NEXT: movdqa %xmm9, %xmm8 ; SSE42-NEXT: pand %xmm10, %xmm8 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 ; SSE42-NEXT: pand %xmm7, %xmm8 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32] ; SSE42-NEXT: movdqa %xmm9, %xmm7 ; SSE42-NEXT: pand %xmm10, %xmm7 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 ; SSE42-NEXT: pand %xmm6, %xmm7 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [4,8] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [4,8] ; SSE42-NEXT: movdqa %xmm9, %xmm6 ; SSE42-NEXT: pand %xmm10, %xmm6 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm6 ; SSE42-NEXT: pand %xmm5, %xmm6 -; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] +; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2] ; SSE42-NEXT: pand %xmm5, %xmm9 ; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 ; SSE42-NEXT: pand %xmm4, %xmm9 @@ -5513,11 +5466,11 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psrlq %xmm8, %xmm5 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] ; SSE42-NEXT: psrlq %xmm6, %xmm4 -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; SSE42-NEXT: movdqa %xmm3, %xmm5 -; SSE42-NEXT: psrlq %xmm8, %xmm5 +; SSE42-NEXT: movdqa %xmm3, %xmm7 +; SSE42-NEXT: psrlq %xmm8, %xmm7 ; SSE42-NEXT: psrlq %xmm6, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4,5,6,7] ; SSE42-NEXT: pxor %xmm4, %xmm3 ; SSE42-NEXT: psubq %xmm4, %xmm3 ; SSE42-NEXT: retq @@ -5526,11 +5479,11 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm4 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 @@ -5679,9 +5632,9 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE42-NEXT: cqto ; SSE42-NEXT: idivq %rcx ; SSE42-NEXT: movq %rax, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE42-NEXT: pextrq $1, %xmm6, %rcx ; SSE42-NEXT: pextrq $1, %xmm2, %rax +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE42-NEXT: cqto ; SSE42-NEXT: idivq %rcx ; SSE42-NEXT: movq %rax, %xmm5 @@ -5701,8 +5654,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; SSE42-NEXT: cqto ; SSE42-NEXT: idivq %rcx ; SSE42-NEXT: movq %rax, %xmm2 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE42-NEXT: pextrq $1, %xmm9, %rcx +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE42-NEXT: pextrq $1, %xmm4, %rax ; SSE42-NEXT: cqto ; SSE42-NEXT: idivq %rcx @@ -5719,8 +5672,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] +; 
AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 ; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 @@ -5738,8 +5691,8 @@ define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, ; AVX2-NEXT: cqto ; AVX2-NEXT: idivq %rcx ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; AVX2-NEXT: vpextrq $1, %xmm5, %rcx +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax ; AVX2-NEXT: cqto ; AVX2-NEXT: idivq %rcx diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index f4d6b52377f57..9a51f617f5d15 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -414,7 +414,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -479,7 +479,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v16f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -593,7 +593,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -661,7 +661,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -946,7 +946,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm2, %k0 ; AVX512F-NEXT: vcmpltpd %zmm1, %zmm3, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1044,9 +1044,9 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1 ; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: 
retq @@ -1178,12 +1178,12 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE42: # %bb.0: ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6 -; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4 +; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: packssdw %xmm5, %xmm4 -; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2 ; SSE42-NEXT: packssdw %xmm3, %xmm2 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1 @@ -1243,7 +1243,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; AVX512F-NEXT: vpcmpgtq %zmm2, %zmm0, %k0 ; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm1, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1277,17 +1277,17 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind { ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %ecx ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: packssdw %xmm7, %xmm6 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: packssdw %xmm7, %xmm6 ; SSE-NEXT: packssdw %xmm5, %xmm4 ; SSE-NEXT: packsswb %xmm6, %xmm4 ; SSE-NEXT: pmovmskb %xmm4, %edx @@ -1341,9 +1341,9 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1 ; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1536,14 +1536,14 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind { ; SSE-NEXT: shll $16, %ecx ; SSE-NEXT: orl %esi, %ecx ; SSE-NEXT: shlq $32, %rcx -; SSE-NEXT: orq %rdx, %rcx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: orq %rdx, %rcx ; SSE-NEXT: pmovmskb %xmm4, %edx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: pmovmskb %xmm5, %esi ; SSE-NEXT: shll $16, %esi -; SSE-NEXT: orl %edx, %esi ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: orl %edx, %esi ; SSE-NEXT: pmovmskb %xmm6, %edx ; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: pmovmskb %xmm7, %edi @@ -1874,9 +1874,9 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0 ; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k2 ; AVX512F-NEXT: kunpckbw %k0, %k2, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: 
vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -2134,12 +2134,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6 -; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4 +; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: packssdw %xmm5, %xmm4 -; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2 ; SSE42-NEXT: packssdw %xmm3, %xmm2 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1 @@ -2150,12 +2150,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; SSE42-NEXT: pmovmskb %xmm0, %ecx ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm15 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm13 -; SSE42-NEXT: packssdw %xmm15, %xmm13 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm14 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 +; SSE42-NEXT: packssdw %xmm15, %xmm13 ; SSE42-NEXT: packssdw %xmm14, %xmm9 -; SSE42-NEXT: packssdw %xmm13, %xmm9 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm12 +; SSE42-NEXT: packssdw %xmm13, %xmm9 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 ; SSE42-NEXT: packssdw %xmm12, %xmm10 ; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 @@ -2178,11 +2178,11 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 ; AVX1-NEXT: vpcmpgtq 256(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vpcmpgtq 240(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm9, %xmm9 ; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm8, %xmm8 ; AVX1-NEXT: vpcmpgtq 208(%rbp), %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpackssdw %xmm9, %xmm6, %xmm6 ; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 ; AVX1-NEXT: vpcmpgtq 192(%rbp), %xmm7, %xmm7 @@ -2202,13 +2202,13 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX1-NEXT: vpcmpgtq 96(%rbp), %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtq 80(%rbp), %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpgtq 64(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtq 64(%rbp), %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtq 48(%rbp), %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq 32(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq 16(%rbp), %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 @@ -2227,19 +2227,19 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX2-NEXT: vpcmpgtq 240(%rbp), %ymm7, %ymm7 ; AVX2-NEXT: vpcmpgtq 208(%rbp), %ymm6, %ymm6 ; AVX2-NEXT: vpackssdw %ymm7, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] ; AVX2-NEXT: vpcmpgtq 176(%rbp), %ymm5, %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,2,1,3] ; AVX2-NEXT: vpcmpgtq 144(%rbp), %ymm4, %ymm4 ; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-NEXT: vpcmpgtq 112(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-NEXT: vpcmpgtq 80(%rbp), %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpcmpgtq 48(%rbp), %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtq 16(%rbp), %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 @@ -2258,9 +2258,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0 ; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k2 ; AVX512F-NEXT: kunpckbw %k0, %k2, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index e88387a8b7c69..e2db3b8c61f01 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -15,8 +15,8 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> ; AVX2-NEXT: movl %esi, %edi ; AVX2-NEXT: subl %eax, %edi ; AVX2-NEXT: vpextrd $2, %xmm1, %edx -; AVX2-NEXT: subl %edx, %edi ; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: subl %edx, %edi ; AVX2-NEXT: subl %ecx, %edi ; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: andl $1, %eax @@ -74,8 +74,8 @@ define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x f ; AVX2-NEXT: vmovd %xmm1, %esi ; AVX2-NEXT: andl $1, %esi ; AVX2-NEXT: movl %esi, %edi -; AVX2-NEXT: subl %edx, %edi ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: subl %edx, %edi ; AVX2-NEXT: subl %ecx, %edi ; AVX2-NEXT: vpextrd $3, %xmm1, %eax ; AVX2-NEXT: subl %eax, %edi @@ -262,8 +262,8 @@ define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> ; AVX2-NEXT: vpextrd $3, %xmm2, %eax ; AVX2-NEXT: addl %edx, %eax ; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vpextrd $1, %xmm3, %ecx +; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: vmovd %xmm3, %edx ; AVX2-NEXT: andl $1, %edx @@ -390,13 +390,13 @@ define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x f ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rax,4) ; AVX2-NEXT: vpextrd $2, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $7, %ecx -; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) ; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx @@ -465,10 +465,10 @@ define <4 x i64> 
@test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> ; AVX2-NEXT: subq %rcx, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rcx -; AVX2-NEXT: movl %edx, %edi -; AVX2-NEXT: subq %rcx, %rdx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: subq %rcx, %rdi ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdi, %r8 ; AVX2-NEXT: subq %rcx, %r8 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx @@ -476,9 +476,10 @@ define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> ; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rcx ; AVX2-NEXT: vmovq %xmm0, (%rsp) ; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%rsi,8) -; AVX2-NEXT: vmovq %xmm1, (%rsp,%rdi,8) -; AVX2-NEXT: andl $3, %edx -; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%rdx,8) +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: vmovq %xmm1, (%rsp,%rax,8) +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%rdi,8) ; AVX2-NEXT: cmpq $3, %r8 ; AVX2-NEXT: movl $3, %eax ; AVX2-NEXT: cmovbq %r8, %rax @@ -600,23 +601,23 @@ define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $128, %rsp ; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpextrd $1, %xmm4, %eax ; AVX2-NEXT: vmovaps %ymm3, (%rsp) -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $1, %xmm3, %eax -; AVX2-NEXT: vmovd %xmm3, %ecx +; AVX2-NEXT: vmovd %xmm4, %ecx +; AVX2-NEXT: vpextrd $2, %xmm4, %edx ; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: vpextrd $2, %xmm3, %eax -; AVX2-NEXT: vpextrd $3, %xmm3, %edx -; AVX2-NEXT: addl %eax, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrd $3, %xmm4, %eax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: vmovd %xmm2, %ecx @@ -793,42 +794,42 @@ define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <1 ; AVX2-NEXT: # kill: def $eax killed $eax killed 
$rax def $rax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rax,4) ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpextrb $7, %xmm2, %edx ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: vpextrb $7, %xmm2, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vmovss %xmm1, (%rsp,%rdx,4) +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rcx,4) ; AVX2-NEXT: vpextrb $9, %xmm2, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rax,4) ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rcx,4) ; AVX2-NEXT: vpextrb $11, %xmm2, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rax,4) ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax @@ -842,13 +843,13 @@ define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <1 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rax,4) ; AVX2-NEXT: vpextrb $14, %xmm2, %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) ; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rdx, %rax ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx @@ -1008,8 +1009,8 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x ; AVX2-NEXT: vmovlps %xmm0, (%rsp) ; AVX2-NEXT: vmovd %xmm2, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) ; AVX2-NEXT: vpextrw $1, %xmm2, %ecx +; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8) ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -1098,53 +1099,53 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vpextrb $1, %xmm1, %r11d ; AVX2-NEXT: vmovd 
%xmm1, %eax -; AVX2-NEXT: movzbl %al, %edx +; AVX2-NEXT: movzbl %al, %esi ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: andb $1, %al ; AVX2-NEXT: subb %r11b, %al -; AVX2-NEXT: vpextrb $2, %xmm1, %esi -; AVX2-NEXT: subb %sil, %al +; AVX2-NEXT: vpextrb $2, %xmm1, %edi ; AVX2-NEXT: vpextrb $3, %xmm1, %r13d +; AVX2-NEXT: subb %dil, %al ; AVX2-NEXT: subb %r13b, %al ; AVX2-NEXT: vpextrb $4, %xmm1, %r12d ; AVX2-NEXT: subb %r12b, %al ; AVX2-NEXT: vpextrb $5, %xmm1, %r15d ; AVX2-NEXT: subb %r15b, %al ; AVX2-NEXT: vpextrb $6, %xmm1, %r14d -; AVX2-NEXT: subb %r14b, %al ; AVX2-NEXT: vpextrb $7, %xmm1, %ebp +; AVX2-NEXT: subb %r14b, %al ; AVX2-NEXT: subb %bpl, %al ; AVX2-NEXT: vpextrb $8, %xmm1, %ebx ; AVX2-NEXT: subb %bl, %al ; AVX2-NEXT: vpextrb $9, %xmm1, %r10d ; AVX2-NEXT: subb %r10b, %al ; AVX2-NEXT: vpextrb $10, %xmm1, %r9d -; AVX2-NEXT: subb %r9b, %al ; AVX2-NEXT: vpextrb $11, %xmm1, %r8d +; AVX2-NEXT: subb %r9b, %al ; AVX2-NEXT: subb %r8b, %al -; AVX2-NEXT: vpextrb $12, %xmm1, %edi -; AVX2-NEXT: subb %dil, %al -; AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx ; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: vpextrb $13, %xmm1, %ecx ; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: subb %cl, %al +; AVX2-NEXT: vpextrb $14, %xmm1, %edx ; AVX2-NEXT: vpextrb $15, %xmm1, %ecx ; AVX2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: subb %dl, %al ; AVX2-NEXT: subb %cl, %al ; AVX2-NEXT: movzbl %al, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -40(%rsp,%rax), %eax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: vpextrb $0, %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rdx) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vpextrb $1, %xmm0, -40(%rsp,%rsi) ; AVX2-NEXT: movzbl %r11b, %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: addq %rsi, %rax ; AVX2-NEXT: vpextrb $2, %xmm0, -40(%rsp,%rax) -; AVX2-NEXT: movzbl %sil, %ecx +; AVX2-NEXT: movzbl %dil, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: vpextrb $3, %xmm0, -40(%rsp,%rcx) @@ -1155,72 +1156,72 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> ; AVX2-NEXT: movzbl %r12b, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl %r15b, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rcx) ; AVX2-NEXT: movzbl %r14b, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl %bpl, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rcx) ; AVX2-NEXT: movzbl %bl, %ecx ; AVX2-NEXT: andl $1, %ecx ; 
AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $8, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl %r10b, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rcx) ; AVX2-NEXT: movzbl %r9b, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl %r8b, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl %dil, %ecx +; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $12, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, -40(%rsp,%rcx) -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rcx) +; AVX2-NEXT: movzbl %dl, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, -40(%rsp,%rax) +; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rax) ; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, -40(%rsp,%rcx) ; AVX2-NEXT: cmpq $15, %rax ; AVX2-NEXT: movl $15, %ecx ; AVX2-NEXT: cmovbq %rax, %rcx @@ -1355,41 +1356,41 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vmovaps %ymm2, (%rsp) ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: vmovd %xmm2, %ecx +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm4 +; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpextrb $1, %xmm4, %eax +; AVX2-NEXT: vmovd %xmm4, %ecx ; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $2, 
%xmm2, %eax -; AVX2-NEXT: vpextrb $3, %xmm2, %edx +; AVX2-NEXT: vpextrb $2, %xmm4, %eax +; AVX2-NEXT: vpextrb $3, %xmm4, %edx ; AVX2-NEXT: addb %al, %dl ; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: vpextrb $5, %xmm2, %ecx +; AVX2-NEXT: vpextrb $4, %xmm4, %eax +; AVX2-NEXT: vpextrb $5, %xmm4, %ecx ; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: vpextrb $6, %xmm4, %eax ; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: vpextrb $7, %xmm4, %ecx +; AVX2-NEXT: vpextrb $8, %xmm4, %esi ; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: vpextrb $7, %xmm2, %ecx -; AVX2-NEXT: vpextrb $8, %xmm2, %edx -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: vpextrb $9, %xmm2, %ecx -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: vpextrb $10, %xmm2, %edx +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: vpextrb $9, %xmm4, %ecx +; AVX2-NEXT: addb %sil, %cl +; AVX2-NEXT: vpextrb $10, %xmm4, %edx ; AVX2-NEXT: addb %cl, %dl +; AVX2-NEXT: vpextrb $11, %xmm4, %ecx +; AVX2-NEXT: vpextrb $12, %xmm4, %esi ; AVX2-NEXT: addb %al, %dl -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: vpextrb $12, %xmm2, %ecx -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: vpextrb $14, %xmm2, %ecx +; AVX2-NEXT: addb %cl, %sil +; AVX2-NEXT: vpextrb $13, %xmm4, %eax +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: addb %sil, %al +; AVX2-NEXT: vpextrb $14, %xmm4, %ecx ; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: vpextrb $15, %xmm4, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: addb %dl, %al ; AVX2-NEXT: movzbl %al, %eax @@ -1424,60 +1425,60 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $7, %xmm3, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $8, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $9, %xmm3, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $10, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $11, %xmm3, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $12, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) +; 
AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $13, %xmm3, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $14, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $15, %xmm3, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vmovd %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx @@ -1485,96 +1486,96 @@ define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $1, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $2, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $3, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $4, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $5, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $6, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $7, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $8, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $9, %xmm0, 
(%rsp,%rdx) ; AVX2-NEXT: vpextrb $9, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $10, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $11, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $12, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $13, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: vpextrb $14, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx ; AVX2-NEXT: andl $31, %edx ; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rdx) +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: andl $31, %edx +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rdx) ; AVX2-NEXT: vpextrb $15, %xmm1, %edx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; AVX2-NEXT: andl $31, %ecx -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: cmpq $31, %rdx ; AVX2-NEXT: movl $31, %ecx ; AVX2-NEXT: cmovbq %rdx, %rcx @@ -1661,15 +1662,16 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x ; AVX2-NEXT: addl %ecx, %edx ; AVX2-NEXT: vpextrw $4, %xmm2, %eax ; AVX2-NEXT: vpextrw $5, %xmm2, %ecx +; AVX2-NEXT: vpextrw $6, %xmm2, %esi ; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: vpextrw $6, %xmm2, %eax -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: addl %edx, %eax -; AVX2-NEXT: vpextrw $7, %xmm2, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: addl %ecx, %esi +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: addl %edx, %esi +; AVX2-NEXT: addl %esi, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: vmovd %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx @@ -1722,8 +1724,8 @@ define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x ; AVX2-NEXT: andl $1, %edi ; AVX2-NEXT: addq %rcx, %rdi ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: cmpq $16, %rdi ; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: cmpq $16, %rdi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX2-NEXT: cmovbw (%rsp,%rsi,2), %ax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -1807,19 +1809,10 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; 
AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $128, %rsp -; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %ecx, %r13d -; AVX2-NEXT: movl %edx, %r15d -; AVX2-NEXT: movl %esi, %ebx -; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: subq $96, %rsp ; AVX2-NEXT: movl 360(%rbp), %eax -; AVX2-NEXT: movl 352(%rbp), %ecx -; AVX2-NEXT: vmovd %ecx, %xmm4 +; AVX2-NEXT: movl 352(%rbp), %r10d +; AVX2-NEXT: vmovd %r10d, %xmm4 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX2-NEXT: movl 368(%rbp), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 @@ -1881,176 +1874,183 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX2-NEXT: movl 344(%rbp), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX2-NEXT: movl 96(%rbp), %eax -; AVX2-NEXT: vmovd %eax, %xmm5 +; AVX2-NEXT: vmovd %eax, %xmm6 ; AVX2-NEXT: movl 104(%rbp), %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 112(%rbp), %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $2, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 120(%rbp), %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $3, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 128(%rbp), %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $4, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 136(%rbp), %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $5, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 144(%rbp), %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $6, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 152(%rbp), %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $7, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 160(%rbp), %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 168(%rbp), %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $9, %eax, %xmm6, %xmm6 ; AVX2-NEXT: movl 176(%rbp), %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $10, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movl %ecx, %ebx ; AVX2-NEXT: movl 184(%rbp), %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $11, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movl %edx, %r11d ; AVX2-NEXT: movl 192(%rbp), %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $12, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movl %esi, %r10d ; AVX2-NEXT: movl 200(%rbp), %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 +; AVX2-NEXT: movl %edi, %r15d ; AVX2-NEXT: movl 208(%rbp), %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX2-NEXT: movl 216(%rbp), %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; AVX2-NEXT: vmovd %edi, %xmm6 -; AVX2-NEXT: vpinsrb $1, %esi, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $2, %edx, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $3, %r13d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: vpinsrb $5, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 16(%rbp), %esi -; AVX2-NEXT: 
vpinsrb $6, %esi, %xmm6, %xmm6 -; AVX2-NEXT: movl 24(%rbp), %edi -; AVX2-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 -; AVX2-NEXT: movl 32(%rbp), %r8d -; AVX2-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 -; AVX2-NEXT: movl 40(%rbp), %r9d -; AVX2-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 -; AVX2-NEXT: movl 48(%rbp), %r10d -; AVX2-NEXT: vpinsrb $10, %r10d, %xmm6, %xmm6 -; AVX2-NEXT: movl 56(%rbp), %r11d -; AVX2-NEXT: vpinsrb $11, %r11d, %xmm6, %xmm6 -; AVX2-NEXT: movl 64(%rbp), %r14d -; AVX2-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 -; AVX2-NEXT: movl 72(%rbp), %r12d -; AVX2-NEXT: vpinsrb $13, %r12d, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: vmovd %edi, %xmm7 +; AVX2-NEXT: vpinsrb $1, %esi, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $2, %edx, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $4, %r8d, %xmm7, %xmm7 +; AVX2-NEXT: vpinsrb $5, %r9d, %xmm7, %xmm7 +; AVX2-NEXT: movl 16(%rbp), %edi +; AVX2-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7 +; AVX2-NEXT: movl 24(%rbp), %r14d +; AVX2-NEXT: vpinsrb $7, %r14d, %xmm7, %xmm7 +; AVX2-NEXT: movl 32(%rbp), %r12d +; AVX2-NEXT: vpinsrb $8, %r12d, %xmm7, %xmm7 +; AVX2-NEXT: movl 40(%rbp), %r13d +; AVX2-NEXT: vpinsrb $9, %r13d, %xmm7, %xmm7 +; AVX2-NEXT: movl 48(%rbp), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 +; AVX2-NEXT: movl 56(%rbp), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX2-NEXT: movl 64(%rbp), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: movl 72(%rbp), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm7, %xmm5 ; AVX2-NEXT: movl 80(%rbp), %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX2-NEXT: movl 88(%rbp), %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm6, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vpaddb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpaddb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpextrb $1, %xmm4, %eax +; AVX2-NEXT: vpextrb $1, %xmm4, %esi ; AVX2-NEXT: vmovd %xmm4, %ecx -; AVX2-NEXT: addb %al, %cl ; AVX2-NEXT: vpextrb $2, %xmm4, %edx ; AVX2-NEXT: vpextrb $3, %xmm4, %eax +; AVX2-NEXT: addb %sil, %cl ; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: vpextrb $4, %xmm4, %edx ; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: vpextrb $4, %xmm4, %ecx -; AVX2-NEXT: vpextrb $5, %xmm4, %edx -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: vpextrb $6, %xmm4, %ecx +; AVX2-NEXT: vpextrb $5, %xmm4, %ecx ; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $7, %xmm4, %eax -; AVX2-NEXT: vpextrb $8, %xmm4, %edx -; AVX2-NEXT: addb %al, %dl -; AVX2-NEXT: vpextrb $9, %xmm4, %eax -; AVX2-NEXT: addb %dl, %al -; AVX2-NEXT: vpextrb $10, %xmm4, %edx -; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: vpextrb $6, %xmm4, %edx ; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: vpextrb $11, %xmm4, %eax -; AVX2-NEXT: vpextrb $12, %xmm4, %ecx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: vpextrb $7, %xmm4, %eax +; AVX2-NEXT: vpextrb $8, %xmm4, %ecx ; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $13, %xmm4, %eax +; AVX2-NEXT: vpextrb $9, %xmm4, %eax ; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: vpextrb $14, %xmm4, %ecx +; AVX2-NEXT: vpextrb $10, %xmm4, 
%ecx ; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: vpextrb $15, %xmm4, %eax -; AVX2-NEXT: addb %cl, %al +; AVX2-NEXT: addb %dl, %cl +; AVX2-NEXT: vpextrb $11, %xmm4, %eax +; AVX2-NEXT: vpextrb $12, %xmm4, %edx +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: vpextrb $13, %xmm4, %eax ; AVX2-NEXT: addb %dl, %al +; AVX2-NEXT: vpextrb $14, %xmm4, %edx +; AVX2-NEXT: vpextrb $15, %xmm4, %esi +; AVX2-NEXT: addb %al, %dl +; AVX2-NEXT: addb %dl, %sil +; AVX2-NEXT: addb %cl, %sil ; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps %ymm2, (%rsp) -; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movzbl %sil, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: movzbl (%rsp,%rax), %esi ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: andl $1, %ebx -; AVX2-NEXT: addq %rax, %rbx -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rbx) ; AVX2-NEXT: andl $1, %r15d -; AVX2-NEXT: addq %rbx, %r15 -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r15) -; AVX2-NEXT: andl $1, %r13d -; AVX2-NEXT: addq %r15, %r13 -; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%r13) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addq %r13, %rcx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%r15) +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r15, %r10 +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%r10) +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%r11) +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: addq %r11, %rbx +; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rbx) +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rbx, %r8 +; AVX2-NEXT: movl %r8d, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: andl $1, %esi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: movl %r9d, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) ; AVX2-NEXT: andl $1, %edi -; AVX2-NEXT: addq %rsi, %rdi -; AVX2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi -; AVX2-NEXT: andl $63, %esi -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rsi) -; AVX2-NEXT: andl $1, %r8d -; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: addq %r9, %rdi +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: addq %rdi, %r14 ; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi ; AVX2-NEXT: andl $63, %edi -; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rdi) -; AVX2-NEXT: andl $1, %r9d -; AVX2-NEXT: addq %r8, %r9 -; AVX2-NEXT: # kill: def $r8d killed $r8d killed $r8 def $r8 -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%r8) -; AVX2-NEXT: andl $1, %r10d -; AVX2-NEXT: addq %r9, %r10 -; AVX2-NEXT: # kill: def $r9d killed $r9d killed $r9 def $r9 -; AVX2-NEXT: andl $63, %r9d -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%r9) -; AVX2-NEXT: andl $1, %r11d -; AVX2-NEXT: addq %r10, %r11 -; AVX2-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10 -; AVX2-NEXT: andl $63, %r10d -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%r10) -; AVX2-NEXT: andl $1, %r14d -; AVX2-NEXT: addq %r11, %r14 -; AVX2-NEXT: # kill: def $r11d killed $r11d killed $r11 def $r11 -; AVX2-NEXT: andl $63, %r11d -; AVX2-NEXT: vpextrb $12, 
%xmm0, (%rsp,%r11) +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rdi) ; AVX2-NEXT: andl $1, %r12d ; AVX2-NEXT: addq %r14, %r12 ; AVX2-NEXT: # kill: def $r14d killed $r14d killed $r14 def $r14 ; AVX2-NEXT: andl $63, %r14d -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%r14) +; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%r14) +; AVX2-NEXT: movl %r12d, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %r12, %r13 +; AVX2-NEXT: movl %r13d, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 48(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %r13, %rax +; AVX2-NEXT: movl 56(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 64(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 72(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 80(%rbp), %eax ; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %r12, %rax -; AVX2-NEXT: # kill: def $r12d killed $r12d killed $r12 def $r12 -; AVX2-NEXT: andl $63, %r12d -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%r12) +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 88(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx @@ -2070,22 +2070,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 112(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 120(%rbp), %ecx +; AVX2-NEXT: movl 112(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 120(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 128(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 136(%rbp), %ecx @@ -2094,22 +2095,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 144(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 152(%rbp), %ecx +; AVX2-NEXT: movl 144(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, 
%xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 152(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 160(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 168(%rbp), %ecx @@ -2118,22 +2120,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 176(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 184(%rbp), %ecx +; AVX2-NEXT: movl 176(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 184(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 192(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 200(%rbp), %ecx @@ -2142,22 +2145,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 208(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 216(%rbp), %ecx +; AVX2-NEXT: movl 208(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 216(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 224(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $0, %xmm1, (%rsp,%rax) ; AVX2-NEXT: movl 232(%rbp), %ecx @@ -2166,22 +2170,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $1, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 240(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $2, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 248(%rbp), %ecx +; AVX2-NEXT: movl 240(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $3, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 248(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 256(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; 
AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $4, %xmm1, (%rsp,%rax) ; AVX2-NEXT: movl 264(%rbp), %ecx @@ -2190,22 +2195,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $5, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 272(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $6, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 280(%rbp), %ecx +; AVX2-NEXT: movl 272(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $7, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 280(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 288(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $8, %xmm1, (%rsp,%rax) ; AVX2-NEXT: movl 296(%rbp), %ecx @@ -2214,22 +2220,23 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $9, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 304(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $10, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 312(%rbp), %ecx +; AVX2-NEXT: movl 304(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $11, %xmm1, (%rsp,%rax) +; AVX2-NEXT: movl 312(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: movl 320(%rbp), %edx ; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $12, %xmm1, (%rsp,%rax) ; AVX2-NEXT: movl 328(%rbp), %ecx @@ -2238,118 +2245,124 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $13, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 336(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $14, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 344(%rbp), %ecx +; AVX2-NEXT: movl 336(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $15, %xmm1, (%rsp,%rax) -; AVX2-NEXT: movl 352(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movl 344(%rbp), %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 360(%rbp), %ecx +; AVX2-NEXT: movl 352(%rbp), %eax +; AVX2-NEXT: andl $1, %eax 
+; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl 360(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 368(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $1, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 368(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $2, %xmm0, (%rsp,%rax) +; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 376(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $3, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 384(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $4, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 392(%rbp), %ecx +; AVX2-NEXT: movl 384(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl 392(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 400(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 400(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $6, %xmm0, (%rsp,%rax) +; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 408(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $7, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 416(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $8, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 424(%rbp), %ecx +; AVX2-NEXT: movl 416(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl 424(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 432(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $9, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 432(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rax) +; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 440(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $11, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 448(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: 
addq %rcx, %rdx ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: andl $63, %eax ; AVX2-NEXT: vpextrb $12, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 456(%rbp), %ecx +; AVX2-NEXT: movl 448(%rbp), %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl 456(%rbp), %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) +; AVX2-NEXT: movl 464(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx ; AVX2-NEXT: movl %edx, %eax ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $13, %xmm0, (%rsp,%rax) -; AVX2-NEXT: movl 464(%rbp), %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $14, %xmm0, (%rsp,%rax) +; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) ; AVX2-NEXT: movl 472(%rbp), %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: andl $63, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, (%rsp,%rax) ; AVX2-NEXT: vpextrb $15, %xmm0, %eax ; AVX2-NEXT: cmpq $64, %rcx -; AVX2-NEXT: cmovbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; AVX2-NEXT: cmovbl %esi, %eax ; AVX2-NEXT: cmpq $63, %rcx ; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: movl $63, %ecx @@ -2916,42 +2929,42 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftrw $1, %k0, %k1 ; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k3} {z} +; AVX512F-NEXT: vpmovdb %zmm2, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: kmovw %k0, %edx -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm2, 64(%rsp,%rax) +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; 
AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpcompressd %zmm6, %zmm6 {%k3} {z} -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm7 {%k3} {z} = -1 +; AVX512F-NEXT: vpmovdb %zmm2, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1 -; AVX512F-NEXT: vpmovdb %zmm6, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpmovdb %zmm0, 64(%rsp,%rax) -; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-NEXT: andl $31, %ecx -; AVX512F-NEXT: vpmovdb %zmm2, 96(%rsp,%rcx) +; AVX512F-NEXT: vpmovdb %zmm0, 96(%rsp,%rcx) ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $63, %edx ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, 128(%rsp,%rdx) -; AVX512F-NEXT: vpmovdb %zmm4, %xmm0 -; AVX512F-NEXT: vpmovdb %zmm5, %xmm2 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm0 +; AVX512F-NEXT: vpmovdb %zmm3, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm2, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm7, %xmm2 -; AVX512F-NEXT: vpmovdb %zmm8, %xmm3 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k3} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, {{[0-9]+}}(%rsp), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2995,17 +3008,17 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x ; AVX2-NEXT: vpaddw %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpextrw $1, %xmm4, %eax ; AVX2-NEXT: vmovd %xmm4, %ecx +; AVX2-NEXT: vpextrw $2, %xmm4, %edx +; AVX2-NEXT: vpextrw $3, %xmm4, %esi ; AVX2-NEXT: addl %eax, 
%ecx -; AVX2-NEXT: vpextrw $2, %xmm4, %eax -; AVX2-NEXT: vpextrw $3, %xmm4, %edx -; AVX2-NEXT: addl %eax, %edx -; AVX2-NEXT: addl %ecx, %edx +; AVX2-NEXT: addl %edx, %esi ; AVX2-NEXT: vpextrw $4, %xmm4, %eax +; AVX2-NEXT: addl %ecx, %esi ; AVX2-NEXT: vpextrw $5, %xmm4, %ecx ; AVX2-NEXT: addl %eax, %ecx ; AVX2-NEXT: vpextrw $6, %xmm4, %eax ; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %esi, %eax ; AVX2-NEXT: vpextrw $7, %xmm4, %ecx ; AVX2-NEXT: addl %eax, %ecx ; AVX2-NEXT: andl $31, %ecx @@ -3121,13 +3134,13 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x ; AVX2-NEXT: vpextrb $14, %xmm3, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rdx, %rcx -; AVX2-NEXT: vpextrb $15, %xmm3, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpextrb $15, %xmm3, %r13d +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: addq %rcx, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: cmpq $32, %rax ; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: cmpq $32, %r13 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; AVX2-NEXT: cmovbw (%rsp,%r13,2), %ax ; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -3234,17 +3247,16 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x ; AVX512F-NEXT: andq $-64, %rsp ; AVX512F-NEXT: subq $128, %rsp ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm5 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5 -; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm1, (%rsp) +; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512F-NEXT: kshiftrw $8, %k2, %k0 ; AVX512F-NEXT: kxorw %k0, %k2, %k0 ; AVX512F-NEXT: kshiftrw $4, %k0, %k2 @@ -3259,10 +3271,11 @@ define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2) -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1 -; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, 
{{[0-9]+}}(%rsp), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, {{[0-9]+}}(%rsp), %ymm1, %ymm0 ; AVX512F-NEXT: vpsllw $15, %ymm3, %ymm1 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, (%rsp), %ymm2, %ymm1 @@ -3289,438 +3302,438 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $288, %rsp # imm = 0x120 -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi -; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vmovss %xmm0, (%rsp) ; AVX2-NEXT: andl $1, %esi ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rsi,4) +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: andl $1, %edx ; AVX2-NEXT: addl %esi, %edx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %edx, %ecx ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 ; AVX2-NEXT: andl $1, %r8d ; AVX2-NEXT: addl %ecx, %r8d ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%r8,4) ; AVX2-NEXT: andl $1, %r9d ; AVX2-NEXT: addl %r8d, %r9d -; AVX2-NEXT: movzbl 16(%rbp), %ecx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4) +; AVX2-NEXT: movzbl 16(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %r9d, %eax +; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 24(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %r9d, %ecx -; AVX2-NEXT: movzbl 24(%rbp), %edx -; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: movzbl 32(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx -; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 40(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm1, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 48(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 32(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 40(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 56(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; 
AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm1, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 48(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 64(%rbp), %ecx +; AVX2-NEXT: movzbl 56(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 72(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 80(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 64(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 72(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 88(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 80(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 96(%rbp), %ecx +; AVX2-NEXT: movzbl 88(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 104(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm2, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 112(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 96(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 104(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 120(%rbp), 
%edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm2, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm2, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 112(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm2, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 128(%rbp), %ecx +; AVX2-NEXT: movzbl 120(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 136(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 144(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm2, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 128(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 136(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 152(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 144(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 160(%rbp), %ecx +; AVX2-NEXT: movzbl 152(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 168(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm3, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 176(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 160(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 168(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; 
AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 184(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm3, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm3, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 176(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm3, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 192(%rbp), %ecx +; AVX2-NEXT: movzbl 184(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 200(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 208(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm3, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 192(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 200(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 216(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 208(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 224(%rbp), %ecx +; AVX2-NEXT: movzbl 216(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 232(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm4, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 240(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 224(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 232(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; 
AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm4, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 248(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm4, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm4, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 240(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm4, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 256(%rbp), %ecx +; AVX2-NEXT: movzbl 248(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 264(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 272(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm4, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 256(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 264(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 280(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 272(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 288(%rbp), %ecx +; AVX2-NEXT: movzbl 280(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 296(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm5, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 304(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 288(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 296(%rbp), %ecx ; AVX2-NEXT: 
movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 312(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm5, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm5, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 304(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm5, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 320(%rbp), %ecx +; AVX2-NEXT: movzbl 312(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm5, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 328(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 336(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm5, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 320(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 328(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 344(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 336(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 352(%rbp), %ecx +; AVX2-NEXT: movzbl 344(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 360(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm6, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 368(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 352(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; 
AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 360(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 376(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm6, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm6, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 368(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm6, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 384(%rbp), %ecx +; AVX2-NEXT: movzbl 376(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm6, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 392(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 400(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm6, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 384(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 392(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 408(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 400(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 416(%rbp), %ecx +; AVX2-NEXT: movzbl 408(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 424(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vmovss %xmm7, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 432(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 
416(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 424(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 440(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vmovss %xmm7, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm7, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 432(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm7, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 448(%rbp), %ecx +; AVX2-NEXT: movzbl 440(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 456(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0 -; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: movzbl 464(%rbp), %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $3, %xmm7, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 448(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movzbl 456(%rbp), %ecx ; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4) -; AVX2-NEXT: movzbl 472(%rbp), %edx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $1, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax def $rax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $63, %eax +; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4) +; AVX2-NEXT: movzbl 464(%rbp), %eax +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4) -; AVX2-NEXT: andl $63, %edx -; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rdx,4) +; AVX2-NEXT: movzbl 472(%rbp), %ecx +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rcx,4) +; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2 @@ -4392,20 +4405,10 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i } define <4 x i32> @test_compress_all_const() nounwind { -; AVX2-LABEL: test_compress_all_const: -; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: test_compress_all_const: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: test_compress_all_const: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,9,0,0] -; AVX512VL-NEXT: retq +; CHECK-LABEL: test_compress_all_const: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0] +; CHECK-NEXT: retq %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> , <4 x i1> , <4 x i32> undef) @@ -4510,68 +4513,68 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) nounwind { ; AVX2-NEXT: vpextrb $4, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $5, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $5, %xmm0, -24(%rsp,%rax) ; AVX2-NEXT: vpextrb $6, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpextrb $6, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $7, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $7, %xmm0, -24(%rsp,%rax) ; AVX2-NEXT: vpextrb $8, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpextrb $8, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $9, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $9, %xmm0, -24(%rsp,%rax) ; AVX2-NEXT: vpextrb $10, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpextrb $10, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $11, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $11, %xmm0, -24(%rsp,%rax) ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpextrb $12, %xmm0, -24(%rsp,%rcx) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rcx) ; AVX2-NEXT: vpextrb $13, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpextrb $14, %xmm1, %edx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $13, %xmm0, -24(%rsp,%rax) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: addl %ecx, %eax +; 
AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addl %ecx, %edx ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpextrb $14, %xmm0, -24(%rsp,%rcx) -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rax) +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpextrb $15, %xmm0, -24(%rsp,%rdx) ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX2-NEXT: retq ; @@ -4728,9 +4731,9 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; AVX2-NEXT: shll $2, %eax ; AVX2-NEXT: movl %edx, -24(%rsp,%rax) ; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vpextrb $4, %xmm0, %edx ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: # kill: def $dl killed $dl killed $edx ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx @@ -4753,16 +4756,16 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: vmovd %edi, %xmm0 ; AVX512F-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; AVX512F-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vpextrb $4, %xmm0, %edx ; AVX512F-NEXT: vpextrb $8, %xmm0, %ecx +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: # kill: def $al killed $al killed $eax ; AVX512F-NEXT: # kill: def $dl killed $dl killed $edx ; AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx @@ -4786,14 +4789,14 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; AVX512VL-NEXT: korw %k1, %k0, %k0 ; AVX512VL-NEXT: movb $7, %al ; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: kandw %k1, %k0, %k1 ; AVX512VL-NEXT: vmovd %edi, %xmm0 ; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512VL-NEXT: kandw %k1, %k0, %k1 ; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} -; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: vpextrb $4, %xmm0, %edx ; AVX512VL-NEXT: vpextrb $8, %xmm0, %ecx +; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax ; AVX512VL-NEXT: # kill: def $dl killed $dl killed $edx ; AVX512VL-NEXT: # kill: def $cl killed $cl killed $ecx @@ -4820,12 +4823,12 @@ define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %ma ; AVX2-NEXT: vpextrd $2, %xmm1, %eax ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rcx,4) ; AVX2-NEXT: vpextrd $3, %xmm1, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; AVX2-NEXT: andl $3, %eax -; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4) ; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: cmpq $3, %rcx ; AVX2-NEXT: movl $3, %edx diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 49062eaef3188..134d6cfb2bab8 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -518,11 +518,17 @@ define <4 x double> 
@constrained_vector_fmul_v4f64() #0 { ; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fmul_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fmul_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fmul_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( <4 x double> @constrained_vector_fadd_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq entry: @@ -652,11 +658,17 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fadd_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fadd_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fadd_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( <4 x double> @constrained_vector_fsub_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NEXT: movaps %xmm1, %xmm2 -; CHECK-NEXT: subss %xmm0, %xmm2 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fsub_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq entry: @@ -789,11 +801,17 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fsub_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fsub_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_fsub_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: retq entry: %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( <4 x double> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4155,9 +4173,9 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx ; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -4208,27 +4226,27 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: 
cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vmovq %rcx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; @@ -4259,8 +4277,8 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -4271,8 +4289,8 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4385,8 +4403,8 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4394,9 +4412,9 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx ; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -4449,27 +4467,27 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: cvttsd2si 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vmovq %rcx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; @@ -4501,8 +4519,8 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -4513,8 +4531,8 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4613,9 +4631,9 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq ; @@ -4648,8 +4666,8 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4667,9 +4685,9 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx ; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vpinsrd $1, %eax, 
%xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -4845,9 +4863,9 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; @@ -4964,8 +4982,8 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -5017,10 +5035,10 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: # %bb.5: # %entry ; CHECK-NEXT: movaps %xmm2, %xmm5 ; CHECK-NEXT: .LBB124_6: # %entry -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: subss %xmm5, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %rax ; CHECK-NEXT: setbe %cl +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx @@ -5080,10 +5098,10 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; AVX1-NEXT: # %bb.5: # %entry ; AVX1-NEXT: vmovaps %xmm0, %xmm5 ; AVX1-NEXT: .LBB124_6: # %entry -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm4, %rax ; AVX1-NEXT: setbe %cl +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: movzbl %cl, %ecx ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx @@ -5162,9 +5180,9 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq ; @@ -5197,8 +5215,8 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -5216,9 +5234,9 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: 
vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx ; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -5248,7 +5266,7 @@ define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] ; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] ; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 @@ -5399,9 +5417,9 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; @@ -5519,8 +5537,8 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -5572,10 +5590,10 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: # %bb.5: # %entry ; CHECK-NEXT: movapd %xmm2, %xmm5 ; CHECK-NEXT: .LBB132_6: # %entry -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: subsd %xmm5, %xmm4 ; CHECK-NEXT: cvttsd2si %xmm4, %rax ; CHECK-NEXT: setbe %cl +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; CHECK-NEXT: movzbl %cl, %ecx ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx @@ -5635,10 +5653,10 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; AVX1-NEXT: # %bb.5: # %entry ; AVX1-NEXT: vmovapd %xmm0, %xmm5 ; AVX1-NEXT: .LBB132_6: # %entry -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubsd %xmm5, %xmm4, %xmm3 -; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm4, %rax ; AVX1-NEXT: setbe %cl +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: movzbl %cl, %ecx ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx @@ -6728,11 +6746,11 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: movd %xmm2, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %eax, %xmm2 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; 
CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -6743,9 +6761,9 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX-NEXT: vpextrd $2, %xmm0, %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq entry: @@ -6805,10 +6823,9 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 ; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: cvtsi2ss %rdx, %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: @@ -6817,10 +6834,10 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6831,10 +6848,10 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -6894,14 +6911,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: cvtsi2sd %rax, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; CHECK-NEXT: movapd %xmm2, %xmm0 ; CHECK-NEXT: movapd %xmm3, %xmm1 ; CHECK-NEXT: retq @@ -6913,12 +6930,12 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -6929,12 +6946,12 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -6959,9 +6976,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: cvtsi2ss %rax, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; CHECK-NEXT: movq %xmm1, %rax -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: cvtsi2ss %rax, %xmm3 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1 @@ -6969,6 +6984,7 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -6980,13 +6996,13 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6997,13 +7013,13 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7453,11 +7469,11 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-NEXT: movd %xmm2, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %rax, %xmm2 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -7468,9 +7484,9 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vpextrd $2, %xmm0, %eax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: retq ; @@ -7480,9 +7496,9 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vpextrd $2, %xmm0, %eax ; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: retq entry: @@ -7700,10 +7716,10 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -7734,10 +7750,10 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] -; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vsubpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: @@ -7863,19 +7879,19 @@ 
define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovd %xmm1, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vextractps $2, %xmm0, %eax +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: movl %eax, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; AVX1-NEXT: vpextrd $1, %xmm1, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; AVX1-NEXT: vpextrd $3, %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax @@ -7893,12 +7909,12 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -7994,13 +8010,13 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -8014,13 +8030,13 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512F-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-ext-logic.ll b/llvm/test/CodeGen/X86/vector-ext-logic.ll index 341cc07994c77..30074c1c32434 100644 --- a/llvm/test/CodeGen/X86/vector-ext-logic.ll +++ b/llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -265,8 +265,8 @@ define <8 x i32> @bool_zext_and(<8 x i1> %x, <8 x i1> %y) { ; ; AVX2-LABEL: bool_zext_and: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll index 889ab6a0818e2..c7a5f215593be 100644 --- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll +++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll @@ -34,7 +34,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun ; X86-SSE-NEXT: leal (%ecx,%ecx), %eax ; X86-SSE-NEXT: andl $31, %eax ; X86-SSE-NEXT: movl 128(%esp,%eax,4), %eax -; X86-SSE-NEXT: leal 1(%ecx,%ecx), %ecx +; X86-SSE-NEXT: leal 1(,%ecx,2), %ecx ; X86-SSE-NEXT: andl $31, %ecx ; X86-SSE-NEXT: movl (%esp,%ecx,4), %edx ; X86-SSE-NEXT: movl %ebp, %esp @@ -80,7 +80,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun ; X86-AVX-NEXT: leal (%ecx,%ecx), %eax ; X86-AVX-NEXT: andl $31, %eax ; X86-AVX-NEXT: movl 128(%esp,%eax,4), %eax -; X86-AVX-NEXT: leal 1(%ecx,%ecx), %ecx +; X86-AVX-NEXT: leal 1(,%ecx,2), %ecx ; X86-AVX-NEXT: andl $31, %ecx ; X86-AVX-NEXT: movl (%esp,%ecx,4), %edx ; X86-AVX-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index fd0525e6d56a2..ce979afe57c0d 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -37,19 +37,19 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: psrlq $1, %xmm1 @@ -57,36 +57,37 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pblendw 
{{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllq %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -97,7 +98,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -119,7 +120,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -156,7 +157,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -169,7 +171,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -188,13 +190,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] ; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa 
%xmm0, %xmm1 -; X86-SSE2-NEXT: psllq %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: psllq %xmm2, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) @@ -214,14 +216,14 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrld %xmm7, %xmm8 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 @@ -239,7 +241,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -387,9 +389,9 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; X86-SSE2-LABEL: var_funnnel_v4i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; X86-SSE2-NEXT: psrld $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 @@ -397,22 +399,22 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: psrld %xmm6, %xmm7 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld %xmm7, %xmm0 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm5, %xmm1 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; 
X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -549,7 +551,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -571,7 +573,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 @@ -600,7 +602,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -793,7 +795,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -811,7 +813,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -903,7 +905,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4 @@ -968,44 +970,44 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: psrlq $1, %xmm1 +; SSE-NEXT: psrlq %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: psllq %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: splatvar_funnnel_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm3, %xmm4 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: 
vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1027,7 +1029,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1063,16 +1065,28 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1094,8 +1108,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; SSE-LABEL: splatvar_funnnel_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: psllq %xmm2, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: psllq %xmm2, %xmm1 @@ -1105,8 +1119,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX-LABEL: splatvar_funnnel_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1115,8 +1129,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; 
AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1125,8 +1139,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512VL-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1135,8 +1149,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512BW-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1155,8 +1169,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1171,8 +1185,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; XOP-LABEL: splatvar_funnnel_v4i32: ; XOP: # %bb.0: -; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 @@ -1182,8 +1196,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; X86-SSE2-LABEL: splatvar_funnnel_v4i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-SSE2-NEXT: psllq %xmm2, %xmm1 @@ -1196,33 +1210,21 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % } define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: psrlw %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: psllw %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
splatvar_funnnel_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pandn %xmm3, %xmm4 -; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: psrlw %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: psllw %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: psrlw %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: psllw %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1233,7 +1235,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1244,7 +1246,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1255,7 +1257,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1276,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1293,7 +1295,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1325,9 +1327,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: psllw %xmm2, %xmm3 -; SSE-NEXT: psrlw $8, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: psllw %xmm2, %xmm1 +; SSE-NEXT: psrlw $8, %xmm3 ; SSE-NEXT: psrlw $8, 
%xmm1 ; SSE-NEXT: packuswb %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -1338,11 +1340,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: @@ -1350,11 +1352,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: @@ -1362,11 +1364,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v16i8: @@ -1374,11 +1376,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 
; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: @@ -1386,11 +1388,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: @@ -1398,11 +1400,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: @@ -1410,11 +1412,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatvar_funnnel_v16i8: @@ -1433,9 +1435,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: psllw %xmm2, %xmm3 -; X86-SSE2-NEXT: psrlw $8, %xmm3 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE2-NEXT: psllw %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $8, %xmm3 ; 
X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -1452,19 +1454,20 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; SSE-LABEL: sink_splatvar: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %esi, %xmm0 -; SSE-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB8_1: # %loop ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-NEXT: movdqu 1024(%rdi,%rax), %xmm1 +; SSE-NEXT: movdqu (%rdi,%rax,4), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: psllq %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: psllq %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; SSE-NEXT: movups %xmm1, 1024(%rdi,%rax) -; SSE-NEXT: addq $16, %rax +; SSE-NEXT: movups %xmm1, (%rdi,%rax,4) +; SSE-NEXT: addq $4, %rax +; SSE-NEXT: cmpq $256, %rax # imm = 0x100 ; SSE-NEXT: jne .LBB8_1 ; SSE-NEXT: # %bb.2: # %end ; SSE-NEXT: retq @@ -1472,19 +1475,20 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX1-LABEL: sink_splatvar: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB8_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX1-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX1-NEXT: vpsllq %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpsllq %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX1-NEXT: vmovups %xmm1, 1024(%rdi,%rax) -; AVX1-NEXT: addq $16, %rax +; AVX1-NEXT: vmovups %xmm1, (%rdi,%rax,4) +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %end ; AVX1-NEXT: retq @@ -1493,7 +1497,7 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovd %esi, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] @@ -1501,12 +1505,13 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB8_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2 +; AVX2-NEXT: vmovdqu (%rdi,%rax,4), %xmm2 ; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax) -; AVX2-NEXT: addq $16, %rax +; AVX2-NEXT: vmovdqu %xmm2, (%rdi,%rax,4) +; AVX2-NEXT: addq $4, %rax +; AVX2-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX2-NEXT: jne .LBB8_1 ; AVX2-NEXT: # %bb.2: # %end ; AVX2-NEXT: retq @@ -1515,14 +1520,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vmovd %esi, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512F-NEXT: xorl %eax, %eax ; AVX512F-NEXT: .p2align 4 ; AVX512F-NEXT: .LBB8_1: # %loop ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512F-NEXT: vmovdqu (%rdi,%rax,4), 
%xmm1 ; AVX512F-NEXT: vprolvd %zmm0, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512F-NEXT: addq $16, %rax +; AVX512F-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; AVX512F-NEXT: addq $4, %rax +; AVX512F-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %end ; AVX512F-NEXT: vzeroupper @@ -1531,14 +1537,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512VL-LABEL: sink_splatvar: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vpbroadcastd %esi, %xmm0 -; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512VL-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX512VL-NEXT: vprolvd %xmm0, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512VL-NEXT: addq $16, %rax +; AVX512VL-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; AVX512VL-NEXT: addq $4, %rax +; AVX512VL-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %end ; AVX512VL-NEXT: retq @@ -1547,14 +1554,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: vmovd %esi, %xmm0 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512BW-NEXT: xorl %eax, %eax ; AVX512BW-NEXT: .p2align 4 ; AVX512BW-NEXT: .LBB8_1: # %loop ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512BW-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX512BW-NEXT: vprolvd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512BW-NEXT: addq $16, %rax +; AVX512BW-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; AVX512BW-NEXT: addq $4, %rax +; AVX512BW-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512BW-NEXT: jne .LBB8_1 ; AVX512BW-NEXT: # %bb.2: # %end ; AVX512BW-NEXT: vzeroupper @@ -1564,14 +1572,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512VBMI2: # %bb.0: # %entry ; AVX512VBMI2-NEXT: vmovd %esi, %xmm0 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VBMI2-NEXT: xorl %eax, %eax ; AVX512VBMI2-NEXT: .p2align 4 ; AVX512VBMI2-NEXT: .LBB8_1: # %loop ; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512VBMI2-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX512VBMI2-NEXT: vprolvd %zmm0, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512VBMI2-NEXT: addq $16, %rax +; AVX512VBMI2-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; AVX512VBMI2-NEXT: addq $4, %rax +; AVX512VBMI2-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512VBMI2-NEXT: jne .LBB8_1 ; AVX512VBMI2-NEXT: # %bb.2: # %end ; AVX512VBMI2-NEXT: vzeroupper @@ -1580,14 +1589,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512VLBW-LABEL: sink_splatvar: ; AVX512VLBW: # %bb.0: # %entry ; AVX512VLBW-NEXT: vpbroadcastd %esi, %xmm0 -; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VLBW-NEXT: xorl %eax, %eax ; AVX512VLBW-NEXT: .p2align 4 ; AVX512VLBW-NEXT: .LBB8_1: # %loop ; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VLBW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512VLBW-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX512VLBW-NEXT: vprolvd %xmm0, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512VLBW-NEXT: addq $16, %rax +; AVX512VLBW-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; 
AVX512VLBW-NEXT: addq $4, %rax +; AVX512VLBW-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512VLBW-NEXT: jne .LBB8_1 ; AVX512VLBW-NEXT: # %bb.2: # %end ; AVX512VLBW-NEXT: retq @@ -1595,14 +1605,15 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; AVX512VLVBMI2-LABEL: sink_splatvar: ; AVX512VLVBMI2: # %bb.0: # %entry ; AVX512VLVBMI2-NEXT: vpbroadcastd %esi, %xmm0 -; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VLVBMI2-NEXT: xorl %eax, %eax ; AVX512VLVBMI2-NEXT: .p2align 4 ; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop ; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VLVBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 ; AVX512VLVBMI2-NEXT: vprolvd %xmm0, %xmm1, %xmm1 -; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; AVX512VLVBMI2-NEXT: addq $16, %rax +; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; AVX512VLVBMI2-NEXT: addq $4, %rax +; AVX512VLVBMI2-NEXT: cmpq $256, %rax # imm = 0x100 ; AVX512VLVBMI2-NEXT: jne .LBB8_1 ; AVX512VLVBMI2-NEXT: # %bb.2: # %end ; AVX512VLVBMI2-NEXT: retq @@ -1611,13 +1622,14 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; XOPAVX1: # %bb.0: # %entry ; XOPAVX1-NEXT: vmovd %esi, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; XOPAVX1-NEXT: xorl %eax, %eax ; XOPAVX1-NEXT: .p2align 4 ; XOPAVX1-NEXT: .LBB8_1: # %loop ; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; XOPAVX1-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1 -; XOPAVX1-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; XOPAVX1-NEXT: addq $16, %rax +; XOPAVX1-NEXT: vprotd %xmm0, (%rdi,%rax,4), %xmm1 +; XOPAVX1-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; XOPAVX1-NEXT: addq $4, %rax +; XOPAVX1-NEXT: cmpq $256, %rax # imm = 0x100 ; XOPAVX1-NEXT: jne .LBB8_1 ; XOPAVX1-NEXT: # %bb.2: # %end ; XOPAVX1-NEXT: retq @@ -1626,13 +1638,14 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) { ; XOPAVX2: # %bb.0: # %entry ; XOPAVX2-NEXT: vmovd %esi, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; XOPAVX2-NEXT: xorl %eax, %eax ; XOPAVX2-NEXT: .p2align 4 ; XOPAVX2-NEXT: .LBB8_1: # %loop ; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; XOPAVX2-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1 -; XOPAVX2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) -; XOPAVX2-NEXT: addq $16, %rax +; XOPAVX2-NEXT: vprotd %xmm0, (%rdi,%rax,4), %xmm1 +; XOPAVX2-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; XOPAVX2-NEXT: addq $4, %rax +; XOPAVX2-NEXT: cmpq $256, %rax # imm = 0x100 ; XOPAVX2-NEXT: jne .LBB8_1 ; XOPAVX2-NEXT: # %bb.2: # %end ; XOPAVX2-NEXT: retq @@ -1760,7 +1773,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14] ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1820,9 +1833,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-NEXT: psrld $27, %xmm2 ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1891,7 +1904,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1934,9 +1947,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-NEXT: psrld $27, %xmm2 ; X86-SSE2-NEXT: psrld $28, %xmm1 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1983,9 +1996,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1996,7 +2009,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2040,9 +2053,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2] -; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128] +; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -2052,38 +2065,38 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: constant_funnnel_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 @@ -2101,7 +2114,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} 
ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 @@ -2147,9 +2160,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,128,64,32,16,8,4,2] -; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,2,4,8,16,32,64,128] +; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index fdd0d68b89003..b2d1671d6e310 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -24,7 +24,7 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -32,9 +32,9 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] @@ -45,8 +45,8 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] @@ -125,7 +125,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -163,7 +163,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: 
var_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm5 @@ -187,14 +187,14 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrldq {{.*#+}} xmm7 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero @@ -280,8 +280,8 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265] @@ -417,7 +417,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 @@ -433,8 +433,8 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521] @@ -594,26 +594,26 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, 
%ymm0, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 @@ -628,16 +628,16 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -736,7 +736,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 -; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] ; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7 ; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3 ; XOPAVX2-NEXT: 
vpshlb %xmm4, %xmm1, %xmm1 @@ -760,25 +760,26 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -789,7 +790,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -811,7 +812,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -848,25 +849,26 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -889,19 +891,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vpsllq %xmm2, 
%xmm3, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm5[1,3] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -910,8 +912,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512F-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -920,8 +922,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VL-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -930,8 +932,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512BW-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -949,8 +951,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -972,19 +974,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] ; XOPAVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm5[1,3] ; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm3, %ymm3 ; XOPAVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 @@ -998,7 +1000,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1006,12 +1008,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i16: @@ -1086,7 +1088,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1094,12 +1096,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: 
retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: @@ -1122,8 +1124,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] @@ -1145,11 +1147,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: @@ -1157,11 +1159,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: @@ -1169,11 +1171,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: @@ -1181,11 +1183,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512BW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: @@ -1193,11 +1195,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: @@ -1205,11 +1207,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> 
%y, <32 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX10-LABEL: splatvar_funnnel_v32i8: @@ -1217,11 +1219,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX10-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX10-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX10-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX10-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX10-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX10-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX10-NEXT: vpsrlw $8, %ymm3, %ymm1 ; AVX10-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX10-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX10-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX10-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: @@ -1233,14 +1235,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] -; XOPAVX1-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; XOPAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; XOPAVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: 
vpperm %xmm4, %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpperm %xmm1, %xmm5, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm1, %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v32i8: @@ -1252,14 +1254,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm5, %xmm5 ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] -; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; XOPAVX2-NEXT: vpsllw %xmm2, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; XOPAVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpperm %xmm1, %xmm5, %xmm3, %xmm2 +; XOPAVX2-NEXT: vpperm %xmm1, %xmm4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat) @@ -1274,9 +1276,9 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovd %edx, %xmm1 ; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [31,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4 @@ -1285,31 +1287,32 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 +; AVX1-NEXT: vmovdqu (%rdi,%rax,4), %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm5 -; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9 -; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11 -; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3] -; 
AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7 -; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] -; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5 +; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm8 +; AVX1-NEXT: vmovdqu 16(%rdi,%rax,4), %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-NEXT: vpsllq %xmm1, %xmm10, %xmm11 +; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm8[1,3],xmm7[1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm8, %xmm9 ; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; AVX1-NEXT: vblendvps %xmm4, %xmm7, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq %xmm2, %xmm10, %xmm5 +; AVX1-NEXT: vpsllq %xmm2, %xmm8, %xmm6 +; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,3],xmm11[1,3] ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3] -; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3 -; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4) -; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4) +; AVX1-NEXT: vblendvps %xmm3, %xmm7, %xmm5, %xmm3 +; AVX1-NEXT: vmovups %xmm4, (%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm3, 16(%rdi,%rax,4) ; AVX1-NEXT: addq $8, %rax +; AVX1-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %exit ; AVX1-NEXT: retq @@ -1320,7 +1323,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2-NEXT: vmovd %ecx, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] @@ -1331,13 +1334,14 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm5 ; AVX2-NEXT: vblendvps %ymm5, %ymm0, %ymm1, %ymm5 ; AVX2-NEXT: vandps %ymm3, %ymm5, %ymm5 -; AVX2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm6 +; AVX2-NEXT: vmovdqu (%rdi,%rax,4), %ymm6 ; AVX2-NEXT: vpsllvd %ymm5, %ymm6, %ymm7 ; AVX2-NEXT: vpsubd %ymm5, %ymm4, %ymm5 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vmovdqu %ymm5, 4096(%rdi,%rax,4) +; AVX2-NEXT: vmovdqu %ymm5, (%rdi,%rax,4) ; AVX2-NEXT: addq $8, %rax +; AVX2-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX2-NEXT: jne .LBB8_1 ; AVX2-NEXT: # %bb.2: # %exit ; AVX2-NEXT: vzeroupper @@ -1349,7 +1353,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512F-NEXT: vmovd %ecx, %xmm1 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512F-NEXT: xorl %eax, %eax ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: .p2align 4 ; AVX512F-NEXT: .LBB8_1: # %loop @@ -1357,10 +1361,11 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 +; AVX512F-NEXT: vmovdqu (%rdi,%rax,4), %ymm4 ; AVX512F-NEXT: vprolvd %zmm3, %zmm4, 
%zmm3 -; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) +; AVX512F-NEXT: vmovdqu %ymm3, (%rdi,%rax,4) ; AVX512F-NEXT: addq $8, %rax +; AVX512F-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %exit ; AVX512F-NEXT: vzeroupper @@ -1370,17 +1375,18 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0 ; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1 -; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VL-NEXT: xorl %eax, %eax ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} -; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 +; AVX512VL-NEXT: vmovdqu (%rdi,%rax,4), %ymm3 ; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) +; AVX512VL-NEXT: vmovdqu %ymm2, (%rdi,%rax,4) ; AVX512VL-NEXT: addq $8, %rax +; AVX512VL-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %exit ; AVX512VL-NEXT: vzeroupper @@ -1392,7 +1398,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512BW-NEXT: vmovd %ecx, %xmm1 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512BW-NEXT: xorl %eax, %eax ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: .p2align 4 ; AVX512BW-NEXT: .LBB8_1: # %loop @@ -1400,10 +1406,11 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; AVX512BW-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 +; AVX512BW-NEXT: vmovdqu (%rdi,%rax,4), %ymm4 ; AVX512BW-NEXT: vprolvd %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) +; AVX512BW-NEXT: vmovdqu %ymm3, (%rdi,%rax,4) ; AVX512BW-NEXT: addq $8, %rax +; AVX512BW-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX512BW-NEXT: jne .LBB8_1 ; AVX512BW-NEXT: # %bb.2: # %exit ; AVX512BW-NEXT: vzeroupper @@ -1415,7 +1422,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512VBMI2-NEXT: vmovd %ecx, %xmm1 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VBMI2-NEXT: xorl %eax, %eax ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: .p2align 4 ; AVX512VBMI2-NEXT: .LBB8_1: # %loop @@ -1423,10 +1430,11 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512VBMI2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX512VBMI2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; AVX512VBMI2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 -; AVX512VBMI2-NEXT: vmovdqu 
4096(%rdi,%rax,4), %ymm4 +; AVX512VBMI2-NEXT: vmovdqu (%rdi,%rax,4), %ymm4 ; AVX512VBMI2-NEXT: vprolvd %zmm3, %zmm4, %zmm3 -; AVX512VBMI2-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) +; AVX512VBMI2-NEXT: vmovdqu %ymm3, (%rdi,%rax,4) ; AVX512VBMI2-NEXT: addq $8, %rax +; AVX512VBMI2-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX512VBMI2-NEXT: jne .LBB8_1 ; AVX512VBMI2-NEXT: # %bb.2: # %exit ; AVX512VBMI2-NEXT: vzeroupper @@ -1436,17 +1444,18 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX512VLBW: # %bb.0: # %entry ; AVX512VLBW-NEXT: vpbroadcastd %edx, %ymm0 ; AVX512VLBW-NEXT: vpbroadcastd %ecx, %ymm1 -; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VLBW-NEXT: xorl %eax, %eax ; AVX512VLBW-NEXT: .p2align 4 ; AVX512VLBW-NEXT: .LBB8_1: # %loop ; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} -; AVX512VLBW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 +; AVX512VLBW-NEXT: vmovdqu (%rdi,%rax,4), %ymm3 ; AVX512VLBW-NEXT: vprolvd %ymm2, %ymm3, %ymm2 -; AVX512VLBW-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) +; AVX512VLBW-NEXT: vmovdqu %ymm2, (%rdi,%rax,4) ; AVX512VLBW-NEXT: addq $8, %rax +; AVX512VLBW-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX512VLBW-NEXT: jne .LBB8_1 ; AVX512VLBW-NEXT: # %bb.2: # %exit ; AVX512VLBW-NEXT: vzeroupper @@ -1456,17 +1465,18 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX10: # %bb.0: # %entry ; AVX10-NEXT: vpbroadcastd %edx, %ymm0 ; AVX10-NEXT: vpbroadcastd %ecx, %ymm1 -; AVX10-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX10-NEXT: xorl %eax, %eax ; AVX10-NEXT: .p2align 4 ; AVX10-NEXT: .LBB8_1: # %loop ; AVX10-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX10-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX10-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX10-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} -; AVX10-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 +; AVX10-NEXT: vmovdqu (%rdi,%rax,4), %ymm3 ; AVX10-NEXT: vprolvd %ymm2, %ymm3, %ymm2 -; AVX10-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) +; AVX10-NEXT: vmovdqu %ymm2, (%rdi,%rax,4) ; AVX10-NEXT: addq $8, %rax +; AVX10-NEXT: cmpq $1024, %rax # imm = 0x400 ; AVX10-NEXT: jne .LBB8_1 ; AVX10-NEXT: # %bb.2: # %exit ; AVX10-NEXT: vzeroupper @@ -1480,7 +1490,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; XOPAVX1-NEXT: vmovd %ecx, %xmm1 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; XOPAVX1-NEXT: xorl %eax, %eax ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 @@ -1493,12 +1503,13 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] ; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm5 ; XOPAVX1-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5 -; XOPAVX1-NEXT: vprotd %xmm5, 4112(%rdi,%rax,4), %xmm5 +; XOPAVX1-NEXT: vprotd %xmm5, 16(%rdi,%rax,4), %xmm5 ; XOPAVX1-NEXT: vblendvps %xmm6, %xmm0, %xmm1, %xmm6 -; XOPAVX1-NEXT: vprotd %xmm6, 4096(%rdi,%rax,4), %xmm6 -; XOPAVX1-NEXT: vmovdqu %xmm6, 4096(%rdi,%rax,4) -; XOPAVX1-NEXT: vmovdqu %xmm5, 4112(%rdi,%rax,4) +; XOPAVX1-NEXT: vprotd %xmm6, (%rdi,%rax,4), %xmm6 +; XOPAVX1-NEXT: vmovdqu %xmm6, (%rdi,%rax,4) +; XOPAVX1-NEXT: vmovdqu %xmm5, 16(%rdi,%rax,4) ; XOPAVX1-NEXT: addq $8, %rax +; XOPAVX1-NEXT: 
cmpq $1024, %rax # imm = 0x400 ; XOPAVX1-NEXT: jne .LBB8_1 ; XOPAVX1-NEXT: # %bb.2: # %exit ; XOPAVX1-NEXT: vzeroupper @@ -1510,7 +1521,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0 ; XOPAVX2-NEXT: vmovd %ecx, %xmm1 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; XOPAVX2-NEXT: xorl %eax, %eax ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: .p2align 4 ; XOPAVX2-NEXT: .LBB8_1: # %loop @@ -1519,11 +1530,12 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 ; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4 -; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3 -; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4) -; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4) +; XOPAVX2-NEXT: vprotd %xmm4, 16(%rdi,%rax,4), %xmm4 +; XOPAVX2-NEXT: vprotd %xmm3, (%rdi,%rax,4), %xmm3 +; XOPAVX2-NEXT: vmovdqu %xmm3, (%rdi,%rax,4) +; XOPAVX2-NEXT: vmovdqu %xmm4, 16(%rdi,%rax,4) ; XOPAVX2-NEXT: addq $8, %rax +; XOPAVX2-NEXT: cmpq $1024, %rax # imm = 0x400 ; XOPAVX2-NEXT: jne .LBB8_1 ; XOPAVX2-NEXT: # %bb.2: # %exit ; XOPAVX2-NEXT: vzeroupper @@ -1611,7 +1623,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,14,50,60] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60] ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1633,11 +1645,11 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1671,8 +1683,8 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $26, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1709,7 +1721,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] +; 
AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1732,9 +1744,9 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq @@ -1755,14 +1767,14 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 # [512,1024,2048,4096,8192,16384,32768,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 # [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,512,1024,2048,4096,8192,16384,32768] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1799,9 +1811,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1811,7 +1823,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1834,13 +1846,13 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; 
XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 -; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1863,11 +1875,11 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2] ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 @@ -1885,33 +1897,33 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: constant_funnnel_v32i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i8: @@ -1962,11 +1974,11 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX10_256: # %bb.0: ; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX10_256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX10_256-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX10_256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX10_256-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX10_256-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX10_256-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX10_256-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX10_256-NEXT: retq ; ; XOPAVX1-LABEL: constant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 1d807fa85ddc5..8a70bb00459d4 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -200,7 +200,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -222,28 +222,28 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vpxor %ymm4, %ymm8, %ymm9 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm7 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7 -; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm7 +; AVX512F-NEXT: vpand %ymm5, %ymm7, %ymm7 ; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm7 ; 
AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm6 ; AVX512F-NEXT: vpxor %ymm2, %ymm8, %ymm7 @@ -254,36 +254,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 -; AVX512F-NEXT: vpand %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm6 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm6 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -291,28 +291,28 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x 
i8> %y, <64 x i8> %amt) ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm5 -; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm7 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512VL-NEXT: vpxor %ymm4, %ymm8, %ymm9 ; AVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm7 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7 -; AVX512VL-NEXT: vpand %ymm4, %ymm7, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm7 +; AVX512VL-NEXT: vpand %ymm5, %ymm7, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm7 ; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm6 ; AVX512VL-NEXT: vpxor %ymm2, %ymm8, %ymm7 @@ -323,36 +323,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 -; AVX512VL-NEXT: vpand %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm5, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -426,7 +426,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -448,7 +448,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa 
{{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -487,8 +487,8 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512F-NEXT: vpsllq %xmm2, %zmm3, %zmm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0 @@ -497,8 +497,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VL-NEXT: vpsllq %xmm2, %zmm3, %zmm3 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0 @@ -507,8 +507,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512BW-NEXT: vpsllq %xmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 @@ -523,8 +523,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 @@ -551,11 +551,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -569,11 +569,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -621,8 +621,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512F-NEXT: vpsllw %xmm2, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] @@ -643,8 +643,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512VL-NEXT: vpsllw %xmm2, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] @@ -666,11 +666,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: @@ -678,11 +678,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm3, %zmm3 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: @@ -690,11 +690,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: @@ -702,11 +702,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 
; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %splat) @@ -928,44 +928,44 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; 
AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: constant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index b763b7bac2432..cc75f7b6d617e 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -36,19 +36,19 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psllq %xmm1, %xmm5 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE2-NEXT: psrlq %xmm2, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm1 @@ -57,37 +57,38 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psllq %xmm1, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlq %xmm3, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: psrlq %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -155,12 +156,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: psllq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm5, %xmm0 ; X86-SSE2-NEXT: retl @@ -414,7 +415,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -425,7 +426,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -724,43 +725,43 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psllq %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: psrlq %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psllq %xmm1, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: psrlq %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: splatvar_funnnel_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psllq %xmm1, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: psrlq %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # 
xmm2 = mem[0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -935,16 +936,16 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: pslld %xmm1, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -957,7 +958,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -968,7 +969,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -979,7 +980,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -990,7 +991,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1001,7 +1002,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x 
i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1044,9 +1045,9 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-SSE2-NEXT: pslld %xmm1, %xmm2 -; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld %xmm1, %xmm0 +; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm0 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 ; X86-SSE2-NEXT: retl @@ -1062,9 +1063,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE-NEXT: psllw %xmm1, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psllw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1074,11 +1075,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: @@ -1086,11 +1087,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: @@ -1098,11 +1099,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; 
AVX512BW-LABEL: splatvar_funnnel_v16i8: @@ -1110,11 +1111,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: @@ -1122,11 +1123,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: @@ -1134,11 +1135,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: @@ -1146,11 +1147,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: @@ -1172,9 +1173,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: psllw %xmm1, 
%xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE2-NEXT: retl @@ -1237,7 +1238,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1251,7 +1252,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1265,7 +1266,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1303,8 +1304,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1347,7 +1348,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1361,7 +1362,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1375,7 +1376,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1395,8 +1396,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; 
X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1409,27 +1410,18 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { } define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { -; SSE2-LABEL: constant_funnnel_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmulhuw %xmm1, %xmm2 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_funnnel_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: constant_funnnel_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmulhuw %xmm1, %xmm2 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: constant_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1437,7 +1429,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; ; AVX512F-LABEL: constant_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1454,9 +1446,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1472,7 +1464,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1506,9 +1498,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1 # [1,128,64,32,16,8,4,2] -; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1517,9 +1509,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1528,9 +1520,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1539,20 +1531,20 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1564,20 +1556,20 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBW-NEXT: 
vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1589,9 +1581,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq @@ -1606,9 +1598,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,128,64,32,16,8,4,2] -; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] +; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 9e872cc6d74a9..05461c8621150 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -22,30 +22,30 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [63,63,63,63] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [63,63,63,63] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm6 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpsubq %xmm6, %xmm7, 
%xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlq %xmm6, %xmm5, %xmm8 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpsubq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] @@ -324,7 +324,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -566,24 +566,25 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -666,19 +667,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = 
xmm2[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 @@ -752,19 +753,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i16: @@ -886,11 +887,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: @@ -898,11 +899,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: @@ -910,11 +911,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: @@ -922,11 +923,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: @@ -934,11 +935,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: @@ -946,11 +947,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: @@ -958,11 +959,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; 
AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: @@ -1023,7 +1024,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1036,7 +1037,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1049,7 +1050,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1086,9 +1087,9 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] @@ -1107,7 +1108,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1120,7 +1121,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1133,7 +1134,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; 
AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1170,7 +1171,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -1204,9 +1205,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1221,7 +1222,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1255,11 +1256,11 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -1277,9 +1278,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1288,9 +1289,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1299,22 +1300,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1325,22 +1324,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1351,9 +1348,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: retq @@ -1619,7 +1616,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 436fbe31f7a34..12afd8f2ed5e9 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -130,30 +130,30 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm5 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (zmm4 & (zmm5 ^ zmm3)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm5 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4)) +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm5 ^ (zmm7 & (zmm6 ^ zmm5)) ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 +; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm5 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 -; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (zmm4 & (zmm5 ^ zmm3)) ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: 
vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3)) @@ -186,7 +186,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8) ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 @@ -307,12 +307,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vpsrlw %xmm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: @@ -324,12 +324,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -418,11 +418,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: @@ -430,11 +430,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: @@ -442,11 +442,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: @@ -454,11 +454,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) @@ -594,9 +594,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -605,9 +605,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -616,9 +616,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq @@ -627,9 +627,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 9ecc6296a844a..931bdb94d3459 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -320,8 +320,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) 
nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -364,7 +364,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -378,7 +378,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -392,7 +392,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -412,8 +412,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 322ebe22671e6..486af8e4b5f10 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -34,14 +34,14 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrld %xmm7, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrld %xmm7, %xmm8 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: pand 
%xmm4, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 @@ -59,7 +59,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -207,9 +207,9 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; ; X86-SSE2-LABEL: var_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] ; X86-SSE2-NEXT: psrld $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 @@ -217,22 +217,22 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: psrld %xmm6, %xmm7 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld %xmm7, %xmm0 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm5, %xmm1 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -261,14 +261,14 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld %xmm7, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: psrld %xmm6, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: psrld %xmm7, %xmm8 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pslld $23, %xmm3 @@ 
-287,7 +287,7 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; SSE41-LABEL: splatvar_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pandn %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -450,10 +450,10 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i32: ; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] ; X86-SSE2-NEXT: psrld $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 @@ -461,22 +461,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: psrld %xmm7, %xmm2 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: psrld %xmm6, %xmm7 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld %xmm7, %xmm0 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm5, %xmm1 -; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: pslld $23, %xmm3 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -498,12 +498,12 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: psrld $27, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: por %xmm1, %xmm0 ; 
SSE2-NEXT: retq @@ -566,7 +566,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -596,12 +596,12 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psrld $28, %xmm1 ; X86-SSE2-NEXT: psrld $27, %xmm2 -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index a56b0a6351a3b..335d915bbb545 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -36,57 +36,58 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; SSE2-NEXT: psrlq %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psllq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: psrlq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: paddq %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllq %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup 
{{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -97,7 +98,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -119,7 +120,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -157,7 +158,8 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 @@ -170,7 +172,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -188,14 +190,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm4, %xmm2 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllq %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: psllq %xmm2, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) @@ -222,10 +224,10 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> 
%amt) ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -240,7 +242,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -250,21 +252,21 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: psrld %xmm7, %xmm8 -; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm4, %xmm7 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v4i32: @@ -275,14 +277,14 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 @@ -406,10 +408,10 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; 
X86-SSE2-NEXT: psrld %xmm5, %xmm1 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; X86-SSE2-NEXT: pandn %xmm4, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -459,8 +461,8 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $23, %xmm1 @@ -484,7 +486,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; SSE41-LABEL: var_funnnel_v8i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: psllw $4, %xmm4 @@ -534,17 +536,17 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm5 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6 -; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 -; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 -; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 @@ -606,7 +608,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -628,7 +630,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, 
%xmm2, %xmm2 @@ -658,7 +660,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v8i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 @@ -702,8 +704,8 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; X86-SSE2-NEXT: pandn %xmm3, %xmm1 ; X86-SSE2-NEXT: psrlw $1, %xmm3 ; X86-SSE2-NEXT: pand %xmm4, %xmm3 -; X86-SSE2-NEXT: por %xmm1, %xmm3 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pslld $23, %xmm1 @@ -799,15 +801,15 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: psllw $5, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: paddb %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrlw $4, %xmm6 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm6 ; SSE41-NEXT: psrlw $2, %xmm6 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE41-NEXT: paddb %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm6 @@ -818,17 +820,17 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: pandn %xmm5, %xmm3 ; SSE41-NEXT: psllw $5, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: paddb %xmm3, %xmm4 ; SSE41-NEXT: paddb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm5 ; SSE41-NEXT: psllw $4, %xmm5 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psllw $2, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -845,68 +847,68 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm5 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpandn %xmm3, 
%xmm2, %xmm2 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4 -; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm5 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; AVX2-NEXT: vpblendvb %xmm4, %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %xmm6, %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpaddb %xmm6, %xmm6, %xmm5 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -923,7 +925,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -1007,7 +1009,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v16i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0 @@ -1089,44 +1091,44 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: psrlq %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: paddq %xmm0, %xmm0 -; SSE2-NEXT: psllq %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: psrlq %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: paddq %xmm0, %xmm0 +; SSE-NEXT: psllq %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: splatvar_funnnel_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] -; 
SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: psrlq %xmm4, %xmm1 -; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: paddq %xmm0, %xmm0 -; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1148,7 +1150,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1185,16 +1187,28 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1216,8 +1230,8 @@ define <4 x i32> 
@splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; SSE-LABEL: splatvar_funnnel_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: psrlq %xmm2, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: psrlq %xmm2, %xmm1 @@ -1227,8 +1241,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX-LABEL: splatvar_funnnel_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 @@ -1237,8 +1251,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 @@ -1258,8 +1272,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 @@ -1296,8 +1310,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; ; XOP-LABEL: splatvar_funnnel_v4i32: ; XOP: # %bb.0: -; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 @@ -1307,8 +1321,8 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; X86-SSE2-LABEL: splatvar_funnnel_v4i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X86-SSE2-NEXT: psrlq %xmm2, %xmm3 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 @@ -1321,33 +1335,21 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % } define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: psrlw 
%xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: paddw %xmm0, %xmm0 -; SSE2-NEXT: psllw %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_funnnel_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: psrlw %xmm4, %xmm1 -; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: psllw %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: psrlw %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: paddw %xmm0, %xmm0 +; SSE-NEXT: psllw %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1358,7 +1360,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1369,7 +1371,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1380,7 +1382,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1401,7 +1403,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1419,7 +1421,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1445,35 +1447,20 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: psrlw %xmm2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrlw %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: packuswb %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_funnnel_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: psrlw %xmm2, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: psrlw %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: psrlw %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: psrlw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: @@ -1481,11 +1468,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: @@ -1493,12 +1480,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm1 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: @@ -1506,12 +1493,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm1 ; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: @@ -1519,12 +1506,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm1 ; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v16i8: @@ -1532,19 +1519,19 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm1 ; AVX512BW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: 
vpackuswb %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78] -; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 @@ -1593,9 +1580,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm4 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: pand %xmm1, %xmm3 ; X86-SSE2-NEXT: packuswb %xmm4, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 @@ -1679,7 +1666,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14] ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1740,9 +1727,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; SSE2-NEXT: psrld $5, %xmm2 ; SSE2-NEXT: psrld $4, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1811,7 +1798,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1855,9 +1842,9 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-NEXT: psrld $5, %xmm2 ; X86-SSE2-NEXT: psrld $4, %xmm1 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1920,9 +1907,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1933,7 +1920,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2013,11 +2000,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; SSE41-NEXT: paddb %xmm0, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v16i8: @@ -2032,10 +2019,10 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2082,7 +2069,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 @@ -2099,7 +2086,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 0fa2c858ff000..05046a28291f9 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -24,7 +24,7 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -35,18 +35,18 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm7 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] @@ -126,7 +126,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -164,7 +164,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = 
[31,31,31,31,31,31,31,31] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -172,32 +172,32 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; AVX1-NEXT: vpsrld %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7 ; AVX1-NEXT: vpsrld %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX1-NEXT: vpsrld %xmm9, %xmm6, %xmm9 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero +; AVX1-NEXT: vpsrld %xmm10, %xmm6, %xmm6 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX1-NEXT: vpsrld %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpsrld %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpmulld %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm8 -; AVX1-NEXT: vpsrld %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm9, %xmm1, %xmm9 +; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm10 +; AVX1-NEXT: vpsrld %xmm10, %xmm1, %xmm10 +; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsrld %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm9[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 @@ -282,7 +282,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 @@ -320,24 +320,24 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) define 
<16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm6 ; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm8 -; AVX1-NEXT: vpblendvb %xmm5, %xmm8, %xmm7, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm7 -; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm7 -; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm7 -; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm7 +; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm6, %xmm7 +; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw $2, %xmm6, %xmm7 +; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm7 +; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm7, %xmm6, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm6 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm4, %xmm7 @@ -352,21 +352,21 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 ; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6 -; AVX1-NEXT: vpsllw $4, %xmm2, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm7 +; AVX1-NEXT: vpsllw $12, %xmm2, %xmm7 +; AVX1-NEXT: vpsllw $4, %xmm2, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm8 -; AVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm6 +; AVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm6 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm8 +; AVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm8 +; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm8 ; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 @@ -445,7 +445,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -462,7 +462,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 @@ -511,60 +511,59 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpsllw $5, %xmm7, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpsrlw $2, %xmm6, %xmm9 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm9 -; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm9 -; AVX1-NEXT: vpsrlw $1, %xmm9, %xmm10 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm10 -; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpsllw $5, %xmm8, %xmm7 +; AVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $2, %xmm4, %xmm9 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm10 +; AVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm9 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm9 +; AVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpblendvb %xmm10, %xmm9, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm9 ; AVX1-NEXT: vpsllw $4, %xmm9, %xmm10 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vpand %xmm11, %xmm10, %xmm10 -; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpsllw $5, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm3, %xmm8, %xmm8 +; AVX1-NEXT: vpsllw $5, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm9, %xmm9 ; AVX1-NEXT: vpsllw $2, %xmm9, %xmm10 ; AVX1-NEXT: vbroadcastss 
{{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX1-NEXT: vpand %xmm12, %xmm10, %xmm10 -; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm9, %xmm9 ; AVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm10 -; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpsllw $5, %xmm2, %xmm8 -; AVX1-NEXT: vpblendvb %xmm8, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm5 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm9, %xmm8 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm9 +; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpsllw $5, %xmm2, %xmm9 +; AVX1-NEXT: vpblendvb %xmm9, %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm11, %xmm4 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm11, %xmm5 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm12, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 @@ -572,8 +571,9 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v32i8: @@ -581,30 +581,30 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4 -; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm5 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm4 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm4 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX2-NEXT: vpaddb %ymm5, 
%ymm5, %ymm5 +; AVX2-NEXT: vpaddb %ymm6, %ymm6, %ymm5 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -613,62 +613,62 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm5 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, 
%ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm6 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm5 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -736,7 +736,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 @@ -790,25 +790,26 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, 
%xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -819,7 +820,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -841,7 +842,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -879,25 +880,26 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: # xmm3 = mem[0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -920,19 +922,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2] ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm4 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm5[0,2] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; AVX1-NEXT: 
vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -941,8 +943,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512F-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512F-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -951,8 +953,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VL-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512VL-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -961,8 +963,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512BW-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512BW-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -980,8 +982,8 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -1004,19 +1006,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2] ; 
XOPAVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm5[0,2] ; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v8i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOPAVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm3, %ymm3 ; XOPAVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; XOPAVX2-NEXT: vpsrlq %xmm2, %ymm0, %ymm0 @@ -1030,7 +1032,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1038,12 +1040,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i16: @@ -1119,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1127,12 +1129,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: @@ -1180,11 +1182,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: 
vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: @@ -1193,11 +1195,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm1 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: @@ -1205,12 +1207,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm1 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: @@ -1219,11 +1221,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm1 ; AVX512BW-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: @@ -1244,12 +1246,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: @@ -1285,14 +1287,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX1-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpperm %xmm4, %xmm5, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpperm %xmm1, %xmm5, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm1, %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; 
; XOPAVX2-LABEL: splatvar_funnnel_v32i8: @@ -1304,14 +1306,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpperm %xmm1, %xmm5, %xmm3, %xmm2 +; XOPAVX2-NEXT: vpperm %xmm1, %xmm4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat) @@ -1376,7 +1378,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,14,50,60] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60] ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1399,11 +1401,11 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1437,8 +1439,8 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm4 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1475,7 +1477,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1499,9 +1501,9 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq @@ -1520,15 +1522,15 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX1-LABEL: constant_funnnel_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,128,64,32,16,8,4,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32768,16384,8192,4096,2048,1024,512,256] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,64,32,16,8,4,2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1566,9 +1568,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -1578,7 +1580,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; 
AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1602,13 +1604,13 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 -; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1657,7 +1659,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,32,8,2,128,2,8,32] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -1677,10 +1679,10 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] -; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1696,10 +1698,10 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] -; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -1712,9 +1714,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512VL-NEXT: 
vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 665223167fbb4..d0f415c1df5f9 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -141,8 +141,8 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpslld $16, %zmm4, %zmm4 -; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 @@ -164,8 +164,8 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpslld $16, %zmm4, %zmm4 -; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 @@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -201,7 +201,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -222,25 +222,25 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 -; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm7 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm7 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm7 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm6 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm7 @@ -253,61 +253,61 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-NEXT: vpand %ymm6, %ymm10, %ymm6 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm7 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; 
AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm6, %ymm7 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 +; AVX512F-NEXT: vpxor %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm7, %ymm9, %ymm7 ; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $4, %ymm5, %ymm6 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpxor %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $2, %ymm5, %ymm6 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm6 -; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm7, %ymm6, %ymm5 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm9, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm6 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6 @@ -320,48 +320,48 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm5, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-NEXT: vpxor %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $2, %ymm5, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 -; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $2, 
%ymm4, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = 
zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -386,12 +386,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -424,7 +424,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -446,7 +446,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -487,8 +487,8 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 ; AVX512F-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512F-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 @@ -497,8 +497,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512VL-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 @@ -507,8 +507,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512BW-NEXT: vpsrlq %xmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512BW-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 @@ -524,8 +524,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm3, 
%zmm3 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm0, %zmm0 @@ -551,13 +551,13 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -569,13 +569,13 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -651,7 +651,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 @@ -672,12 +672,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: @@ -696,12 +696,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: @@ -886,19 +886,19 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5 +; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] -; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; 
AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] ; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 @@ -914,11 +914,11 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpmaddubsw %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 @@ -928,19 +928,19 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] -; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] +; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512VL-NEXT: vpmullw %ymm4, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] -; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX512VL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] ; AVX512VL-NEXT: vpmullw %ymm6, %ymm1, %ymm1 @@ -956,11 +956,11 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] -; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaddubsw %ymm5, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3 @@ -970,12 +970,12 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v64i8: @@ -992,12 +992,12 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm1 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 9ce682306f18b..fd6f0c9beb977 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -36,19 +36,19 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrlq %xmm1, %xmm5 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = 
xmm4[0],xmm5[1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE2-NEXT: psllq %xmm2, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: orpd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63] ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: psubq %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm1 @@ -57,37 +57,38 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrlq %xmm1, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq %xmm3, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE41-NEXT: psllq %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -157,12 +158,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm5, %xmm0 ; X86-SSE2-NEXT: retl @@ -432,7 +433,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: 
vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -443,7 +444,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -630,12 +631,12 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; ; AVX512BW-LABEL: var_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -751,43 +752,43 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: psubq %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrlq %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: psllq %xmm3, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: psubq %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq %xmm1, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: psllq %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: splatvar_funnnel_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [63,63] -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: psubq %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: psrlq %xmm1, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: psllq %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: 
splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -981,7 +982,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -994,7 +995,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1005,7 +1006,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1016,7 +1017,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1027,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1038,7 +1039,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1099,33 +1100,19 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind } define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { -; SSE2-LABEL: splatvar_funnnel_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_funnnel_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: splatvar_funnnel_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: psrlw %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: psrlw %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v16i8: ; AVX1: # %bb.0: @@ -1133,11 +1120,11 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i8: @@ -1145,12 +1132,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: @@ -1158,12 +1145,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, 
%xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v16i8: @@ -1171,12 +1158,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v16i8: @@ -1184,12 +1171,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: @@ -1250,9 +1237,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; X86-SSE2-NEXT: psrlw %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: psrlw %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE2-NEXT: retl @@ -1315,7 +1302,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed 
$xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1329,7 +1316,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1343,7 +1330,7 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1381,8 +1368,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1425,7 +1412,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1439,7 +1426,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1453,7 +1440,7 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1473,8 +1460,8 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1523,9 +1510,9 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512BW-LABEL: 
constant_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1541,7 +1528,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1575,9 +1562,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128] -; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,128,64,32,16,8,4,2] +; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1586,9 +1573,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,2,4,8,16,32,64,128] -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,128,64,32,16,8,4,2] +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1597,9 +1584,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,2,4,8,16,32,64,128] -; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,128,64,32,16,8,4,2] +; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1608,20 +1595,20 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,2,4,8,16,32,64,128] -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,128,64,32,16,8,4,2] +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, 
%xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1633,20 +1620,20 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1658,9 +1645,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq @@ -1675,9 +1662,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,2,4,8,16,32,64,128] -; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,128,64,32,16,8,4,2] +; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git 
a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index 3d4f283260aa5..045f844690441 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -22,30 +22,30 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [63,63,63,63] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [63,63,63,63] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm6 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm6 +; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpsubq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpsllq %xmm6, %xmm5, %xmm8 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpsubq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] @@ -340,7 +340,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -505,8 +505,8 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; ; AVX512BW-LABEL: var_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm2, %zmm2 @@ -526,7 +526,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm4, %ymm3 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -597,24 +597,25 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubq %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -701,19 +702,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm3[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -791,19 +792,19 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_funnnel_v16i16: @@ -931,11 +932,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v32i8: @@ -944,11 +945,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i8: @@ -956,12 +957,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; 
AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: @@ -970,11 +971,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm1 ; AVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: @@ -982,12 +983,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm1 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: @@ -1075,7 +1076,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1088,7 +1089,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1101,7 +1102,7 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VBMI2-LABEL: 
constant_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1138,9 +1139,9 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] @@ -1159,7 +1160,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1172,7 +1173,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1185,7 +1186,7 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1256,9 +1257,9 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1273,7 +1274,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1307,11 +1308,11 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -1329,9 +1330,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1340,9 +1341,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1351,22 +1352,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: 
vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1377,22 +1376,20 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1403,9 +1400,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: retq @@ -1671,7 +1668,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 ; 
AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 11ea650e1f02d..f38afc8c90a3a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -213,7 +213,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] @@ -229,7 +229,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] @@ -303,14 +303,14 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm5 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: @@ -320,14 +320,14 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -397,7 +397,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 @@ -418,12 +418,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: @@ -431,12 +431,12 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm1 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: @@ -594,9 +594,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -605,9 +605,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; 
AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -616,9 +616,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq @@ -627,9 +627,9 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index 178c02f384f9b..7aa34c0c0f2d0 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -342,8 +342,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -386,7 +386,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -400,7 +400,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; 
AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -414,7 +414,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,0,0] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -434,8 +434,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 372deb05e550c..cb456205b6ee5 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -41,10 +41,10 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -59,7 +59,7 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; ; SSE41-LABEL: var_funnnel_v2i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -69,21 +69,21 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: psrld %xmm7, %xmm8 -; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm4, %xmm7 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; 
SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_funnnel_v2i32: @@ -94,14 +94,14 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 @@ -225,10 +225,10 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm5, %xmm1 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; X86-SSE2-NEXT: pandn %xmm4, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -269,10 +269,10 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm5, %xmm1 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: pslld $23, %xmm3 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; SSE2-NEXT: cvttps2dq %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] @@ -288,7 +288,7 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; SSE41-LABEL: splatvar_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] ; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] @@ -298,21 +298,21 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: psrld %xmm7, %xmm8 -; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: psrld %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: psrld %xmm4, 
%xmm7 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm4, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 ; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_funnnel_v2i32: @@ -324,14 +324,14 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 @@ -470,10 +470,10 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> % ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] ; X86-SSE2-NEXT: psrld %xmm5, %xmm1 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; X86-SSE2-NEXT: pandn %xmm4, %xmm3 ; X86-SSE2-NEXT: pslld $23, %xmm3 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] ; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm1 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] @@ -578,7 +578,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 3cc17c1f2b86a..01cc58a6ea0be 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -183,9 +183,9 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE-NEXT: psrlw $2, %xmm1 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: 
movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: psrlw $7, %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: paddb %xmm1, %xmm0 ; SSE-NEXT: psubb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -205,9 +205,9 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -222,10 +222,10 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper @@ -240,10 +240,10 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -260,25 +260,25 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE-LABEL: test_divconstant_16i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632] ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632] -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632] ; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: psraw $8, %xmm1 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64] -; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: psraw $8, %xmm2 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32] +; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: psrlw $7, %xmm0 @@ -301,10 +301,10 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32] +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 @@ -334,7 +334,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_divconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427] ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 @@ -361,21 +361,22 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: test_rem7_2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; SSE2-NEXT: movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925 ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi +; SSE2-NEXT: imulq %rdi +; SSE2-NEXT: movq %rdx, %rsi ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: shrq $63, %rax -; SSE2-NEXT: sarq %rdx -; SSE2-NEXT: addq %rax, %rdx -; SSE2-NEXT: leaq (,%rdx,8), %rax -; SSE2-NEXT: subq %rax, %rdx -; SSE2-NEXT: addq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: sarq %rsi +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: leaq (,%rsi,8), %rax +; SSE2-NEXT: subq %rax, %rsi +; SSE2-NEXT: addq %rcx, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: imulq %rsi +; SSE2-NEXT: imulq %rdi +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rdx, %rax ; SSE2-NEXT: 
shrq $63, %rax ; SSE2-NEXT: sarq %rdx @@ -383,28 +384,28 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: leaq (,%rdx,8), %rax ; SSE2-NEXT: subq %rax, %rdx ; SSE2-NEXT: addq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: pextrq $1, %xmm0, %rcx -; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; SSE41-NEXT: movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925 ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi +; SSE41-NEXT: imulq %rdi +; SSE41-NEXT: movq %rdx, %rsi ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax -; SSE41-NEXT: sarq %rdx -; SSE41-NEXT: addq %rax, %rdx -; SSE41-NEXT: leaq (,%rdx,8), %rax -; SSE41-NEXT: subq %rax, %rdx -; SSE41-NEXT: addq %rcx, %rdx -; SSE41-NEXT: movq %rdx, %xmm1 +; SSE41-NEXT: sarq %rsi +; SSE41-NEXT: addq %rax, %rsi +; SSE41-NEXT: leaq (,%rsi,8), %rax +; SSE41-NEXT: subq %rax, %rsi +; SSE41-NEXT: addq %rcx, %rsi ; SSE41-NEXT: movq %xmm0, %rcx ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: imulq %rsi +; SSE41-NEXT: imulq %rdi +; SSE41-NEXT: movq %rsi, %xmm1 ; SSE41-NEXT: movq %rdx, %rax ; SSE41-NEXT: shrq $63, %rax ; SSE41-NEXT: sarq %rdx @@ -419,20 +420,21 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-LABEL: test_rem7_2i64: ; AVX: # %bb.0: ; AVX-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; AVX-NEXT: movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi +; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: sarq %rsi +; AVX-NEXT: addq %rax, %rsi +; AVX-NEXT: leaq (,%rsi,8), %rax +; AVX-NEXT: subq %rax, %rsi +; AVX-NEXT: addq %rcx, %rsi ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi +; AVX-NEXT: vmovq %rsi, %xmm0 ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -440,8 +442,8 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: leaq (,%rdx,8), %rax ; AVX-NEXT: subq %rax, %rdx ; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %res = srem <2 x i64> %a, ret <2 x i64> %res @@ -625,7 +627,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -647,7 +649,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw $2, 
%xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -670,13 +672,13 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: test_remconstant_16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,20224,11008,47872,26368,14592,14592,37632] -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,33024,14592,26368,47872,11008,20224,37632] +; SSE2-NEXT: psrlw $8, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: packuswb %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255] @@ -686,11 +688,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,128,64,32,128,64,32] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: packuswb %xmm2, %xmm3 ; SSE2-NEXT: psrlw $7, %xmm1 @@ -738,9 +740,9 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] -; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: psllw $8, %xmm2 ; SSE41-NEXT: por %xmm2, %xmm1 ; SSE41-NEXT: psubb %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -765,14 +767,14 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: 
vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -805,19 +807,19 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427] -; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [65427,65409,57,103,65467,43,79,65427,65427,79,43,65467,103,57,57,65427] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2] +; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm3 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $7, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm3 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 +; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 @@ -842,56 +844,56 @@ define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; 
SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movsbl %ah, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax @@ -899,19 +901,19 @@ define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: 
movsbl %ah, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: idivb -{{[0-9]+}}(%rsp) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: movsbl %ah, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index 3b9ac630ab258..4ee41e3bb6876 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -12,30 +12,31 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; AVX1-NEXT: imulq %rsi +; AVX1-NEXT: movq %rdx, %rcx ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax -; AVX1-NEXT: sarq %rdx -; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: sarq %rcx +; AVX1-NEXT: addq %rax, %rcx ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: imulq %rsi +; AVX1-NEXT: vmovq %rcx, %xmm1 ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: imulq %rsi +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax ; AVX1-NEXT: sarq %rdx ; AVX1-NEXT: addq %rax, %rdx ; AVX1-NEXT: vmovq %rdx, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: imulq %rcx +; AVX1-NEXT: imulq %rsi ; AVX1-NEXT: movq %rdx, %rax ; AVX1-NEXT: shrq $63, %rax ; AVX1-NEXT: sarq %rdx @@ -49,30 +50,31 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; AVX2-NEXT: imulq %rsi +; AVX2-NEXT: movq %rdx, %rcx ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax -; AVX2-NEXT: sarq %rdx -; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: sarq %rcx +; AVX2-NEXT: addq %rax, %rcx ; 
AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: imulq %rsi +; AVX2-NEXT: vmovq %rcx, %xmm1 ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: imulq %rsi +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax ; AVX2-NEXT: sarq %rdx ; AVX2-NEXT: addq %rax, %rdx ; AVX2-NEXT: vmovq %rdx, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: imulq %rcx +; AVX2-NEXT: imulq %rsi ; AVX2-NEXT: movq %rdx, %rax ; AVX2-NEXT: shrq $63, %rax ; AVX2-NEXT: sarq %rdx @@ -212,9 +214,9 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq @@ -229,9 +231,9 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -250,42 +252,42 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,20224,11008,47872,26368,14592,33024,37632] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [47872,12544,26368,6912,14592,30976,33024,35072] +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [32,64,128,32,64,128,64,64] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,64,128,32,64,128,64,64] +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [16,64,32,128,64,32,32,32] ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [35072,33024,30976,14592,6912,26368,12544,47872] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [35072,33024,30976,14592,6912,26368,12544,47872] +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [37632,33024,14592,26368,47872,11008,20224,37632] +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,32,32,64,128,32,64,16] -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 @@ -309,10 +311,10 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2NOBW-NEXT: vpsraw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [32,32,32,64,128,32,64,16,32,64,128,32,64,128,64,64] -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2NOBW-NEXT: vpsraw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32] +; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 @@ -657,49 +659,49 @@ define <32 x i8> @test_remconstant_32i8(<32 
x i8> %a) nounwind { ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [47872,12544,26368,6912,14592,30976,33024,35072] ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [32,64,128,32,64,128,64,64] ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm6, %xmm6 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [16,64,32,128,64,32,32,32] ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX1-NEXT: vpsrlw $7, %xmm4, %xmm6 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] -; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [35072,33024,30976,14592,6912,26368,12544,47872] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] +; AVX1-NEXT: vpsllw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [35072,33024,30976,14592,6912,26368,12544,47872] +; AVX1-NEXT: vpsubb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, 
%xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632] +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,32,32,64,128,32,64,16] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [64,64,128,64,32,128,64,32] ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,64,128,64,32,128,64,32] -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 @@ -730,14 +732,14 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsraw $8, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,64,128,64,32,128,64,32,16,64,32,128,64,32,32,32] ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] -; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 6256c4f2acf7f..d45a0dc042596 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -11,31 +11,32 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX-NEXT: vpextrq $1, %xmm1, %rax -; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 -; AVX-NEXT: imulq %rcx +; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; AVX-NEXT: imulq %rsi +; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: sarq %rcx +; AVX-NEXT: addq %rax, %rcx ; AVX-NEXT: vmovq %xmm1, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi +; AVX-NEXT: vmovq %rcx, %xmm1 ; AVX-NEXT: movq %rdx, %rax ; 
AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vmovq %rdx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -45,29 +46,29 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx ; AVX-NEXT: addq %rax, %rdx ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: imulq %rcx +; AVX-NEXT: imulq %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -88,7 +89,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ; AVX-NEXT: vpsrld $31, %zmm0, %zmm1 @@ -172,7 +173,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, 
%zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -181,7 +182,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & mem) ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 @@ -203,42 +204,42 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,18176,37632,4864,20224,10496,11008,45824,37632,20224,11008,47872,26368,14592,33024,37632] -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072] +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 -; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [16,32,16,128,32,64,64,16,32,64,128,32,64,128,64,64] -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,32,16,128,32,64,64,16,32,64,128,32,64,128,64,64] +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32] ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912] -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912] +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072] +; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64] -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16] +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 
; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0 @@ -262,10 +263,10 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsraw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 @@ -285,9 +286,9 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925 +; AVX-NEXT: movabsq $5270498306774157605, %rdi # imm = 0x4924924924924925 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -298,7 +299,7 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vmovq %rdx, %xmm2 ; AVX-NEXT: vmovq %xmm1, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -307,22 +308,23 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: subq %rax, %rdx ; AVX-NEXT: addq %rcx, %rdx ; AVX-NEXT: vmovq %rdx, %xmm1 +; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; AVX-NEXT: vpextrq $1, %xmm3, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi +; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax -; AVX-NEXT: sarq %rdx -; AVX-NEXT: addq %rax, %rdx -; AVX-NEXT: leaq (,%rdx,8), %rax -; AVX-NEXT: subq %rax, %rdx -; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx +; AVX-NEXT: sarq %rsi +; AVX-NEXT: addq %rax, %rsi +; AVX-NEXT: leaq (,%rsi,8), %rax +; AVX-NEXT: subq %rax, %rsi +; AVX-NEXT: addq %rcx, %rsi +; AVX-NEXT: vmovq %xmm3, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi +; AVX-NEXT: vmovq %rsi, %xmm2 ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -330,13 +332,13 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: leaq (,%rdx,8), %rax ; AVX-NEXT: subq %rax, %rdx ; AVX-NEXT: addq %rcx, %rdx -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovq %rdx, %xmm3 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -347,7 +349,7 @@ define 
<8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -359,7 +361,7 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -370,7 +372,7 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vmovq %rdx, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: imulq %rsi +; AVX-NEXT: imulq %rdi ; AVX-NEXT: movq %rdx, %rax ; AVX-NEXT: shrq $63, %rax ; AVX-NEXT: sarq %rdx @@ -394,7 +396,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1 ; AVX-NEXT: vpsrld $31, %zmm1, %zmm2 @@ -499,7 +501,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -508,7 +510,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 -; 
AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 = zmm3 ^ (zmm2 & mem) ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 @@ -538,49 +540,49 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [6912,28416,14592,15104,30976,32000,33024,34048,47872,12544,26368,6912,14592,30976,33024,35072] ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 -; AVX512F-NEXT: vextracti64x4 $1, %zmm4, %ymm5 -; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5 +; AVX512F-NEXT: vpaddb %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [16,32,16,128,32,64,64,16,32,64,128,32,64,128,64,64] ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsraw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [64,16,32,32,16,16,16,16,16,64,32,128,64,32,32,32] ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] +; AVX512F-NEXT: vpsrlw $7, %ymm4, %ymm6 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm4, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm6 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] -; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912] -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] +; AVX512F-NEXT: vpsllw $8, %ymm5, %ymm5 +; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [35072,33024,30976,14592,6912,26368,12544,47872,34048,33024,32000,30976,15104,14592,28416,6912] +; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37632,33024,14592,26368,47872,11008,20224,37632,45824,11008,10496,20224,4864,37632,18176,35072] +; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,32,32,64,128,32,64,16,16,16,16,16,32,32,16,64] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpsraw $8, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16] ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [64,64,128,64,32,128,64,32,16,64,64,32,128,16,32,16] -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm5, 
%ymm5 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0] ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 @@ -611,9 +613,9 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsraw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 13f7d68ccb893..aecbc8b71fe7f 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -190,7 +190,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] ; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -271,30 +271,30 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256] ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128] ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: packuswb %xmm2, %xmm3 ; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -310,7 +310,6 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37] -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -318,6 +317,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7] ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147] +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: packuswb %xmm3, %xmm4 ; SSE41-NEXT: psubb %xmm4, %xmm0 @@ -332,8 +332,8 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64] -; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64] +; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -367,9 +367,9 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64] +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, 
%xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -400,7 +400,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_divconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37] @@ -413,7 +413,7 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -665,7 +665,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] ; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -764,10 +764,10 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256] ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147] ; SSE2-NEXT: psrlw $8, %xmm3 @@ -777,9 +777,9 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128] -; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0] +; SSE2-NEXT: psrlw $8, %xmm4 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: packuswb %xmm4, %xmm2 ; SSE2-NEXT: paddb %xmm3, %xmm2 @@ -789,9 +789,9 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64] +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14] ; SSE2-NEXT: pand %xmm4, %xmm2 @@ -825,8 +825,8 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128] -; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0] +; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: packuswb %xmm2, %xmm3 ; SSE41-NEXT: paddb %xmm4, %xmm3 @@ -839,9 +839,9 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] -; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -881,9 +881,9 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7] -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -917,7 +917,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; 
AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37] @@ -930,7 +930,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 1a5c3730c1839..ceeeed597aa99 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -11,33 +11,33 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind { ; AVX1-LABEL: test_div7_4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 +; AVX1-NEXT: movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493 ; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi +; AVX1-NEXT: mulq %rdi ; AVX1-NEXT: subq %rdx, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: addq %rdx, %rcx +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: movq %rsi, %rax +; AVX1-NEXT: mulq %rdi ; AVX1-NEXT: vmovq %rcx, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: subq %rdx, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: addq %rdx, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 +; AVX1-NEXT: subq %rdx, %rsi +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: addq %rdx, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi +; AVX1-NEXT: mulq %rdi ; AVX1-NEXT: subq %rdx, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: addq %rdx, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rcx ; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: mulq %rsi +; AVX1-NEXT: mulq %rdi ; AVX1-NEXT: subq %rdx, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: addq %rdx, %rcx @@ -331,11 +331,11 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,256,16,64,128,16,16,32,64,64,32,32,32,128,256,64] -; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,32,16,16,128,64,16,256,32] +; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_divconstant_32i8: @@ -371,18 +371,18 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: mulq %rsi -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: addq %rdx, %rax -; AVX1-NEXT: shrq $2, %rax -; AVX1-NEXT: leaq (,%rax,8), %rdx -; AVX1-NEXT: subq %rdx, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: subq %rdx, %rdi +; AVX1-NEXT: shrq %rdi +; AVX1-NEXT: addq %rdx, %rdi +; AVX1-NEXT: shrq $2, %rdi +; AVX1-NEXT: leaq (,%rdi,8), %rax +; AVX1-NEXT: subq %rax, %rdi +; AVX1-NEXT: addq %rcx, %rdi ; AVX1-NEXT: vmovq %xmm1, %rcx ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: mulq %rsi +; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: subq %rdx, %rax ; AVX1-NEXT: shrq %rax @@ -391,11 +391,11 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: leaq (,%rax,8), %rdx ; AVX1-NEXT: subq %rdx, %rax ; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: mulq %rsi +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: subq %rdx, %rax ; AVX1-NEXT: shrq %rax @@ -428,18 +428,18 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: mulq %rsi -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: shrq $2, %rax -; AVX2-NEXT: leaq (,%rax,8), %rdx -; AVX2-NEXT: subq %rdx, %rax -; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: subq %rdx, %rdi +; AVX2-NEXT: shrq %rdi +; AVX2-NEXT: addq %rdx, %rdi +; AVX2-NEXT: shrq $2, %rdi +; AVX2-NEXT: leaq (,%rdi,8), %rax +; AVX2-NEXT: subq %rax, %rdi +; AVX2-NEXT: addq %rcx, %rdi ; AVX2-NEXT: vmovq %xmm1, %rcx ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: mulq %rsi +; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: subq %rdx, %rax ; AVX2-NEXT: shrq %rax @@ -448,11 +448,11 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: leaq (,%rax,8), %rdx ; AVX2-NEXT: subq %rdx, %rax ; AVX2-NEXT: addq %rcx, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vmovq %rax, %xmm2 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: mulq %rsi +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: subq %rdx, %rax ; AVX2-NEXT: shrq %rax @@ -689,9 +689,9 @@ define <32 x i8> 
@test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,128,0,0,0,128] -; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0] +; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 @@ -780,9 +780,9 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] -; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0] ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index fd7a4c9b8d5ad..05fd4a2578576 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -11,16 +11,16 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX-NEXT: vpextrq $1, %xmm1, %rcx -; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 +; AVX-NEXT: movabsq $2635249153387078803, %rdi # imm = 0x2492492492492493 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx ; AVX-NEXT: vmovq %rcx, %xmm2 ; AVX-NEXT: vmovq %xmm1, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx @@ -29,46 +29,46 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx ; AVX-NEXT: vmovq %rcx, %xmm3 ; AVX-NEXT: vmovq %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx ; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX-NEXT: vpextrq $1, %xmm4, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrq $1, %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: subq %rdx, %rcx -; AVX-NEXT: shrq %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %xmm2, %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi +; 
AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx ; AVX-NEXT: vmovq %rcx, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: vmovq %xmm4, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx +; AVX-NEXT: vpextrq $1, %xmm0, %rsi ; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: mulq %rdi +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX-NEXT: subq %rdx, %rsi +; AVX-NEXT: shrq %rsi +; AVX-NEXT: addq %rdx, %rsi +; AVX-NEXT: vmovq %rsi, %xmm3 ; AVX-NEXT: vmovq %xmm0, %rcx ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi +; AVX-NEXT: mulq %rdi ; AVX-NEXT: subq %rdx, %rcx ; AVX-NEXT: shrq %rcx ; AVX-NEXT: addq %rdx, %rcx @@ -89,7 +89,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0 ; AVX-NEXT: vpsrld $1, %zmm0, %zmm0 @@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -200,9 +200,9 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256] ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37] -; 
AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256] +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137] ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 @@ -218,22 +218,22 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,16,256,128,32,64,16,16,64,64,32,32,32,128,256,64] -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32] +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256] ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27] ; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256] +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm3, 
%ymm3 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137] ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256] -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137] -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0] @@ -260,9 +260,9 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137] ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 @@ -278,11 +278,11 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = udiv <64 x i8> %a, ret <64 x i8> %res @@ -300,18 +300,18 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: shrq %rax -; AVX-NEXT: addq %rdx, %rax -; AVX-NEXT: shrq $2, %rax -; AVX-NEXT: leaq (,%rax,8), %rdx -; AVX-NEXT: subq %rdx, %rax -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: movq %rcx, %rdi +; AVX-NEXT: subq %rdx, %rdi +; AVX-NEXT: shrq %rdi +; AVX-NEXT: addq %rdx, %rdi +; AVX-NEXT: shrq $2, %rdi +; AVX-NEXT: leaq (,%rdi,8), %rax +; AVX-NEXT: subq %rax, %rdi +; AVX-NEXT: addq %rcx, %rdi ; AVX-NEXT: vmovq %xmm1, %rcx ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi +; AVX-NEXT: vmovq %rdi, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: subq %rdx, %rax ; AVX-NEXT: shrq %rax @@ -320,8 +320,8 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: leaq (,%rax,8), %rdx ; AVX-NEXT: subq %rdx, %rax ; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX-NEXT: vpextrq $1, %xmm2, %rcx ; AVX-NEXT: movq %rcx, %rax @@ -374,8 +374,8 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { ; AVX-NEXT: subq %rdx, %rax ; AVX-NEXT: addq %rcx, %rax ; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vpextrq $1, %xmm0, %rcx +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rcx, %rax @@ -414,7 +414,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq 
%zmm1, %zmm3, %zmm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] ; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1 ; AVX-NEXT: vpsrld $1, %zmm1, %zmm1 @@ -517,7 +517,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -552,9 +552,9 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256] ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37] -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256] +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137] ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 @@ -580,19 +580,19 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7] ; AVX512F-NEXT: vpsllw $8, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256] ; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm5 +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27] ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256] +; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27] +; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137] ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256] -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137] -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0] @@ -625,9 +625,9 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # 
[137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37] -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137] ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll index 3ff3f8d275c98..0245e7cb505c9 100644 --- a/llvm/test/CodeGen/X86/vector-idiv.ll +++ b/llvm/test/CodeGen/X86/vector-idiv.ll @@ -66,12 +66,12 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind { ; ; AVX2-LABEL: PR20355: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] -; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuldq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index b3d8d05f69947..9cd8b9d7ff524 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -320,7 +320,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [84148480,218892552,353636624,488380696] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -339,7 +339,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [84148480,218892552,353636624,488380696] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; 
AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -516,7 +516,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm4 = [84148480,218892552,353636624,488380696] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ -547,7 +547,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm4 = [84148480,218892552,353636624,488380696] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ -628,9 +628,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -641,9 +641,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -654,9 +654,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = 
[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -667,9 +667,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -999,7 +999,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [84148480,218892552,353636624,488380696] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] @@ -1054,7 +1054,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [84148480,218892552,353636624,488380696] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] @@ -1193,11 +1193,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; 
AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1213,11 +1213,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1233,11 +1233,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1253,11 +1253,11 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index da902b3aed5ab..228570a04170b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -58,23 +58,23 @@ define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-LABEL: load_i16_stride3_vf2: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovd %xmm1, (%rsi) -; AVX2-FP-NEXT: vmovd %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovd %xmm0, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovd %xmm1, (%rdx) +; AVX2-FP-NEXT: vmovd %xmm2, (%rcx) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride3_vf2: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovd %xmm1, (%rsi) -; AVX2-FCP-NEXT: vmovd %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovd %xmm0, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovd %xmm1, (%rdx) +; AVX2-FCP-NEXT: vmovd %xmm2, (%rcx) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride3_vf2: @@ -93,12 +93,12 @@ define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i16_stride3_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi) -; AVX512-FCP-NEXT: vmovd %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovd %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovd %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovd %xmm1, (%rdx) +; AVX512-FCP-NEXT: vmovd %xmm2, (%rcx) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf2: @@ -117,12 +117,12 @@ define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm0[0,3,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi) -; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride3_vf2: @@ -141,12 +141,12 @@ define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i16_stride3_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rcx) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf2: @@ -165,12 +165,12 @@ define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rcx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <6 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <6 x i16> %wide.vec, <6 x i16> poison, <2 x i32> @@ -345,15 +345,15 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,3,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpshuflw {{.*#+}} 
xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,1,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm1, (%rdx) ; AVX512BW-NEXT: vmovq %xmm2, (%rcx) @@ -362,12 +362,12 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -377,15 +377,15 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,3,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,1,2,3] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) @@ -394,12 +394,12 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -477,18 +477,18 @@ define void @load_i16_stride3_vf8(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vmovdqa %xmm3, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] +; AVX-NEXT: vmovdqa %xmm1, (%rsi) ; AVX-NEXT: vmovdqa %xmm4, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq @@ -628,12 +628,12 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride3_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] ; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) @@ -643,12 +643,12 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -658,12 +658,12 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = 
[0,3,6,9,12,15,18,21] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) @@ -673,12 +673,12 @@ define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) @@ -827,193 +827,193 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0],xmm9[1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7] ; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX-NEXT: vmovaps %ymm2, (%rsi) -; AVX-NEXT: vmovaps %ymm7, (%rdx) +; AVX-NEXT: vmovaps %ymm1, (%rdx) ; AVX-NEXT: vmovaps %ymm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride3_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, 
%xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7],ymm5[8],ymm3[9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride3_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7],ymm5[8],ymm3[9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride3_vf16: ; AVX2-FCP: # %bb.0: -; 
AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7],ymm5[8],ymm3[9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride3_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm3 ^ ymm4)) ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm3 ^ ymm4)) +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX512-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512-NEXT: vzeroupper @@ -1021,43 +1021,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride3_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: 
vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm3 ^ ymm4)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) -; 
AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm3 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512-FCP-NEXT: vzeroupper @@ -1065,43 +1065,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride3_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm3 ^ ymm4)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm3 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -1109,43 +1109,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1)) -; 
AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm3 ^ ymm4)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm3 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -1155,11 +1155,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1171,11 +1171,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1187,11 +1187,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 
= [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1203,11 +1203,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1551,26 +1551,26 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] -; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13,14],ymm8[15] +; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm9 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7] @@ -1579,8 +1579,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm10 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1602,12 +1602,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 @@ -1616,12 +1616,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-NEXT: vmovdqa %ymm4, 32(%rsi) ; AVX2-NEXT: vmovdqa %ymm9, (%rsi) ; AVX2-NEXT: vmovdqa %ymm10, 32(%rdx) ; AVX2-NEXT: vmovdqa %ymm11, (%rdx) @@ -1635,26 +1635,26 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13,14],ymm8[15] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm9 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7] @@ -1663,8 +1663,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1686,12 +1686,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 @@ -1700,12 +1700,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: 
vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rsi) ; AVX2-FP-NEXT: vmovdqa %ymm9, (%rsi) ; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx) @@ -1719,26 +1719,26 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,4,7] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6],ymm8[7],ymm11[8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13,14],ymm8[15] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7] @@ -1747,8 
+1747,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm10 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1770,12 +1770,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 @@ -1784,12 +1784,12 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rsi) ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rsi) ; AVX2-FCP-NEXT: 
vmovdqa %ymm10, 32(%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx) @@ -1812,23 +1812,23 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa %ymm0, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm7 ^ ymm8)) +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] @@ -1841,7 +1841,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm8 ^ ymm7)) ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1861,7 +1861,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm8 ^ ymm7)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -1870,7 +1870,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512-NEXT: vzeroupper @@ -1890,23 +1890,23 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm7 ^ ymm8)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, 
%zmm9 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] @@ -1919,7 +1919,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm8 ^ ymm7)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1939,7 +1939,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm8 ^ ymm7)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -1948,7 +1948,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512-FCP-NEXT: vzeroupper @@ -1968,23 +1968,23 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm7 ^ ymm8)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] @@ -1997,7 +1997,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm8 ^ ymm7)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -2017,7 +2017,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm8 ^ ymm7)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2026,7 +2026,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -2046,23 +2046,23 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm7 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] @@ -2075,7 +2075,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm8 ^ ymm7)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -2095,7 +2095,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm8 ^ ymm7)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2104,7 +2104,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -2115,17 +2115,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2138,17 +2138,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2161,17 +2161,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2184,17 +2184,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2915,146 +2915,148 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm13, %ymm0 ; AVX2-NEXT: 
vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 -; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 -; AVX2-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 -; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 -; AVX2-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm3 +; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm6 +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm7 +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm11 +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm13 +; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm15 +; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm10 +; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm9 +; AVX2-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14],ymm1[15] +; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-NEXT: vmovdqa 272(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm12 -; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7,8,9],ymm0[10],ymm13[11,12],ymm0[13],ymm13[14,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; 
AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7],ymm1[8],ymm14[9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] -; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6],xmm8[7] +; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3063,16 +3065,16 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-NEXT: vmovdqa %ymm10, 64(%rdx) -; AVX2-NEXT: vmovdqa %ymm14, (%rdx) +; AVX2-NEXT: vmovdqa %ymm15, 64(%rdx) +; AVX2-NEXT: vmovdqa %ymm10, (%rdx) ; AVX2-NEXT: vmovdqa %ymm13, 96(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm11, 32(%rdx) ; AVX2-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: addq $136, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3088,146 +3090,148 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm13, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm7 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm11 +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm13, %ymm12, %ymm12 +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm13 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm15 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm9 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,4,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14],ymm1[15] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm12 -; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7,8,9],ymm0[10],ymm13[11,12],ymm0[13],ymm13[14,15] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm14[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7],ymm1[8],ymm14[9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; 
AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6],xmm8[7] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3236,16 +3240,16 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm10, 64(%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm14, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm15, 64(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm10, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm13, 96(%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm3, 
64(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FP-NEXT: addq $136, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -3261,146 +3265,148 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm12, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm13, %ymm12, %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm7, 
%ymm10, %ymm9, %ymm13 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm15 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,4,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14],ymm1[15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} 
xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm12 -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = 
[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7,8,9],ymm0[10],ymm13[11,12],ymm0[13],ymm13[14,15] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm14[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7],ymm1[8],ymm14[9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, 
%xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,0,1] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6],xmm8[7] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3409,16 +3415,16 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm10, 64(%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm14, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm15, 64(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm13, 96(%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: addq $136, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -4087,23 +4093,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4124,23 +4130,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4161,23 +4167,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; 
AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 @@ -4198,23 +4204,23 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index ae4f85ce42a19..4c0e6b4ad949b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -98,11 +98,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512-NEXT: vmovd %xmm1, (%rdx) ; AVX512-NEXT: vmovd %xmm3, 
(%rcx) -; AVX512-NEXT: vmovd %xmm2, (%r8) +; AVX512-NEXT: vmovd %xmm0, (%r8) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride4_vf2: @@ -111,11 +111,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovd %xmm1, (%rdx) ; AVX512-FCP-NEXT: vmovd %xmm3, (%rcx) -; AVX512-FCP-NEXT: vmovd %xmm2, (%r8) +; AVX512-FCP-NEXT: vmovd %xmm0, (%r8) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride4_vf2: @@ -125,11 +125,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovd %xmm1, (%rdx) ; AVX512DQ-NEXT: vmovd %xmm3, (%rcx) -; AVX512DQ-NEXT: vmovd %xmm2, (%r8) +; AVX512DQ-NEXT: vmovd %xmm0, (%r8) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf2: @@ -138,11 +138,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rcx) -; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r8) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride4_vf2: @@ -152,11 +152,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vmovd %xmm1, (%rdx) ; AVX512BW-NEXT: vmovd %xmm3, (%rcx) -; AVX512BW-NEXT: vmovd %xmm2, (%r8) +; AVX512BW-NEXT: vmovd %xmm0, (%r8) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf2: @@ -165,11 +165,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovd %xmm2, (%r8) +; 
AVX512BW-FCP-NEXT: vmovd %xmm0, (%r8) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride4_vf2: @@ -179,11 +179,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rdx) ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovd %xmm2, (%r8) +; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r8) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf2: @@ -192,11 +192,11 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> @@ -1028,10 +1028,10 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] @@ -1137,8 +1137,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1233,7 +1233,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = 
[0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 @@ -1259,7 +1259,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm10 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 @@ -1365,9 +1365,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride4_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4 @@ -1375,7 +1374,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9 @@ -1387,7 +1386,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 @@ -1479,9 +1478,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4 @@ -1489,7 +1487,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9 @@ -1501,7 +1499,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 @@ -1525,15 +1523,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-LABEL: load_i16_stride4_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1544,15 +1542,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512BW-FCP-LABEL: load_i16_stride4_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1563,15 +1561,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-BW-LABEL: 
load_i16_stride4_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1582,15 +1580,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -2118,19 +2116,19 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -2345,10 +2343,10 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] @@ -2573,136 +2571,136 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm9, %xmm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm4, 
%ymm9, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm11 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FCP-NEXT: addq $104, %rsp ; AVX2-FCP-NEXT: vzeroupper @@ -2843,9 +2841,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7 @@ -2853,7 +2850,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] @@ -2881,7 +2878,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 @@ -3054,9 +3051,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7 @@ -3064,7 +3060,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] @@ -3092,7 +3088,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 @@ -3141,23 +3137,23 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w 
%zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -3177,23 +3173,23 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -3213,23 +3209,23 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: 
vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -3249,23 +3245,23 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, 
%zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -4366,19 +4362,19 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -4401,10 +4397,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] @@ -4865,10 +4861,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; 
AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] @@ -4910,19 +4906,19 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -5334,300 +5330,298 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride4_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX2-FCP-NEXT: 
vmovdqa 480(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm3 +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm12 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm15 
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm13 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm11 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm10 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] 
-; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 
= xmm3[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] @@ -5635,7 +5629,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FCP-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -5666,7 +5660,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm15, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm14, (%r8) ; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -5952,9 +5946,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 @@ -5962,7 +5955,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6022,7 +6015,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8 @@ -6375,9 +6368,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 ; 
AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 @@ -6385,7 +6377,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6445,7 +6437,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8 @@ -6526,21 +6518,21 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm7, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 @@ -6551,7 +6543,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm8, 
%zmm13 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 @@ -6562,14 +6554,14 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -6588,21 +6580,21 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 @@ -6613,7 +6605,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = 
[2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 @@ -6624,14 +6616,14 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -6650,21 +6642,21 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 @@ -6675,7 +6667,7 @@ define void 
@load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 @@ -6686,14 +6678,14 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -6712,21 +6704,21 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12 @@ -6737,7 +6729,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14 @@ -6748,14 +6740,14 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..76e32598ebfd8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -348,30 +348,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX-NEXT: vpsrlq $48, %xmm2, %xmm5 +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) +; AVX-NEXT: vmovq %xmm5, (%rdx) +; AVX-NEXT: vmovq %xmm6, (%rcx) +; AVX-NEXT: vmovq %xmm4, (%r8) ; AVX-NEXT: vmovq %xmm1, (%r9) ; AVX-NEXT: retq ; @@ -461,21 +461,21 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i16_stride5_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] -; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpextrw $7, %xmm2, %eax +; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512-NEXT: vpextrw $6, %xmm0, %eax ; AVX512-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512-NEXT: vmovd %r10d, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 +; AVX512-NEXT: vmovd %r10d, %xmm3 +; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpextrw $3, %xmm2, %eax +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; AVX512-NEXT: vmovd %xmm2, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] @@ -486,8 +486,8 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] ; AVX512-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm5, (%rcx) ; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) @@ -497,27 +497,27 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] ; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovd %xmm2, %eax +; AVX512-FCP-NEXT: vmovd %xmm3, %eax ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0],xmm0[1,2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) @@ -527,21 +527,21 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i16_stride5_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] -; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpextrw $7, %xmm2, %eax +; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax ; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512DQ-NEXT: vmovd %r10d, %xmm4 -; AVX512DQ-NEXT: vpinsrw $1, %eax, 
%xmm4, %xmm4 -; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 +; AVX512DQ-NEXT: vmovd %r10d, %xmm3 +; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpextrw $3, %xmm2, %eax +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovd %xmm2, %eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] @@ -552,8 +552,8 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) @@ -563,27 +563,27 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax +; AVX512DQ-FCP-NEXT: vmovd %xmm3, %eax ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0],xmm0[1,2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm3[2],xmm0[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) @@ -597,11 +597,11 @@ define void 
@load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] @@ -623,11 +623,11 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpextrw $7, %xmm2, %eax ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] @@ -649,11 +649,11 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vpextrw $7, %xmm2, %eax ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] @@ -675,11 +675,11 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm2, %eax ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] @@ -829,10 +829,10 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4,5,6,7] ; AVX-NEXT: vpsllq $48, %xmm5, %xmm7 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3] @@ -843,13 +843,13 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,4,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5],xmm9[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] @@ -1018,36 +1018,36 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] ; AVX512-NEXT: vpbroadcastw 70(%rdi), %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] -; AVX512-NEXT: vpsllq $48, %xmm3, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-NEXT: vpsllq $48, %xmm4, %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX512-NEXT: 
vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0] +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] ; AVX512-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512-NEXT: vmovdqa %xmm5, (%rcx) ; AVX512-NEXT: vmovdqa %xmm6, (%r8) ; AVX512-NEXT: vmovdqa %xmm0, (%r9) @@ -1110,36 +1110,36 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] ; AVX512DQ-NEXT: vpbroadcastw 70(%rdi), %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] -; AVX512DQ-NEXT: vpsllq $48, %xmm3, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-NEXT: vpsllq $48, %xmm4, %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] ; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] ; AVX512DQ-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512DQ-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rcx) ; AVX512DQ-NEXT: vmovdqa %xmm6, (%r8) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9) @@ -1196,15 +1196,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1218,15 +1218,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw 
{{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1240,15 +1240,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1262,15 +1262,15 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1523,12 +1523,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] ; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm5[5,6,7] ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 @@ -1573,10 +1573,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] @@ -1590,12 +1590,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] @@ -1647,53 +1647,53 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm7 +; AVX2-NEXT: vmovdqa 144(%rdi), %xmm5 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-NEXT: vpblendvb %ymm6, %ymm9, %ymm10, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm4[2],xmm5[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] @@ -1707,14 +1707,14 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 
= xmm1[0,1,2,1,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-NEXT: vmovdqa %ymm6, (%rsi) ; AVX2-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-NEXT: vmovdqa %ymm9, (%r8) @@ -1736,52 +1736,52 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm7 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} 
ymm10 = ymm10[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm9, %ymm10, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 @@ -1811,89 +1811,89 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FCP-LABEL: load_i16_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,3,5,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = 
[0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,4,7,1,6] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,u,u,4,7,1,6] +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm9, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,1,3,6,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} 
ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] ; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,2,5,7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,0,2,5,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1933,30 +1933,30 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 @@ -1993,7 +1993,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,u,u,u,4,6,1,3] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] @@ -2001,71 +2001,71 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,3,5,u] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,u,u,u,4,7,1,6] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,1,3,6,u] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] -; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,4,6,0,1,4,6,0] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,4,7,0,2,4,7,0] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,0,2,5,7] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx) @@ -2110,30 +2110,30 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 @@ -2170,7 +2170,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,u,u,u,4,6,1,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] @@ -2178,71 +2178,71 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,3,5,u] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,u,u,u,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,1,3,6,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} 
ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,4,6,0,1,4,6,0] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,4,7,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx) @@ -2256,26 +2256,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; 
AVX512BW-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -2289,26 +2289,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -2322,26 +2322,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa 
{{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -2355,26 +2355,26 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -3233,7 +3233,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] @@ -3379,8 +3379,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 @@ -3453,7 +3452,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] @@ -3601,8 +3600,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15] 
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 @@ -3649,133 +3647,126 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FCP-LABEL: load_i16_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FCP-NEXT: subq $200, %rsp ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm13 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm5, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15] -; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,4,7,1,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,4,7,1,6] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7] ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm15 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = 
[0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm14, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm14, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,u,u,5,7,2,4] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5],ymm15[6],ymm2[7,8],ymm15[9],ymm2[10,11],ymm15[12],ymm2[13],ymm15[14],ymm2[15] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm14, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] +; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10],ymm7[11],ymm10[12,13],ymm7[14],ymm10[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 @@ -3785,55 +3776,50 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] +; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm11, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7],ymm10[8,9],ymm7[10],ymm10[11],ymm7[12],ymm10[13,14],ymm7[15] +; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,u,u,6,0,3,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3844,16 +3830,16 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), 
%ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm15, 32(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FCP-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FCP-NEXT: addq $200, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3862,69 +3848,73 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa64 176(%rdi), %xmm20 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm8 +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vmovdqa 
96(%rdi), %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm15 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6],ymm14[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6,7] +; AVX512-NEXT: vpor %ymm12, %ymm14, %ymm14 ; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX512-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm14)) +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm16 +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = 
[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpsrlq $48, %xmm20, %xmm15 -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] +; AVX512-NEXT: vpsrlq $48, %xmm5, %xmm15 +; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm0 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero @@ -3935,32 +3925,24 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] ; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,1,3] ; AVX512-NEXT: vpshufhw 
{{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] ; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX512-NEXT: vmovdqa64 %xmm15, %xmm22 +; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] @@ -3968,17 +3950,18 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm13 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) @@ -3992,47 +3975,53 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5],ymm1[6],ymm6[7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13],ymm1[14],ymm6[15] ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4],xmm14[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] +; AVX512-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6],ymm15[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: 
vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm7)) +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm8[2],xmm5[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4],xmm5[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7,8],ymm1[9],ymm6[10],ymm1[11],ymm6[12,13],ymm1[14],ymm6[15] ; AVX512-NEXT: vextracti128 $1, 
%ymm1, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 @@ -4055,160 +4044,162 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,7,1,4,6,u,u] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,3,2,4,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,u,u,u,4,6,1,3] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] +; AVX512-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,1,u,0,3,5,u] +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm13, %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3],xmm12[4,5,6],xmm15[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm17, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,u,1,3,6,u] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm17 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,5,7,4,7,u,u] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] ; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4],xmm13[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm18 & (zmm12 ^ zmm6)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm19 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] 
-; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,3,5,2,5,7,u,u] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0] +; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm13)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm18 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} ymm17 = [0,3,u,u,5,0,2,7] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [1,3,6,0,5,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,u,u,6,0,3,5] +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,4,6,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -4219,8 +4210,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -4231,69 +4222,73 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 176(%rdi), %xmm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13 -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm15 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6],ymm14[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-NEXT: vpor %ymm12, %ymm14, %ymm14 ; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm15 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm14)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm16 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpsrlq $48, %xmm20, %xmm15 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] +; AVX512DQ-NEXT: vpsrlq $48, %xmm5, %xmm15 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,3,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero @@ -4304,32 +4299,24 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm13 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm22 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] @@ -4337,17 +4324,18 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm13 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) @@ -4361,47 +4349,53 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5],ymm1[6],ymm6[7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13],ymm1[14],ymm6[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4],xmm14[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6],ymm15[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm7)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm8[2],xmm5[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4],xmm5[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7,8],ymm1[9],ymm6[10],ymm1[11],ymm6[12,13],ymm1[14],ymm6[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 @@ -4424,160 +4418,162 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,7,1,4,6,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,3,2,4,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,u,u,u,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] +; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,1,u,0,3,5,u] +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm13, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3],xmm12[4,5,6],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm17, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,u,1,3,6,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm17 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,5,7,4,7,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] ; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4],xmm13[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm18 & (zmm12 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: 
vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,3,5,2,5,7,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0] +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm18 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,3,u,u,5,0,2,7] 
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [1,3,6,0,5,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,u,u,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,4,6,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -4588,8 +4584,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -4598,244 +4594,244 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: 
load_i16_stride5_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = 
[35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride5_vf32: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: 
vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm6 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride5_vf32: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; 
AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] +; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf32: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = 
[16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <160 x i16>, ptr %in.vec, align 64 @@ -6552,7 +6548,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15] @@ -6603,7 +6599,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload @@ -6892,8 +6888,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -7010,48 +7005,47 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4],ymm15[5],ymm0[6,7],ymm15[8],ymm0[9,10],ymm15[11],ymm0[12],ymm15[13],ymm0[14,15] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4],ymm7[5],ymm2[6,7],ymm7[8],ymm2[9,10],ymm7[11],ymm2[12],ymm7[13],ymm2[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm10 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm12 @@ -7063,55 +7057,54 @@ define void 
@load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13],ymm5[14],ymm2[15] ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] ; 
AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0],ymm8[1],mem[2],ymm8[3],mem[4,5],ymm8[6],mem[7,8],ymm8[9],mem[10],ymm8[11],mem[12,13],ymm8[14],mem[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] @@ -7119,128 +7112,126 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] 
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 -; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm12 +; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm13 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm13[1],xmm9[2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm10 -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3] +; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm8[1],xmm6[2,3] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; 
AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm2[1],xmm7[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm4 +; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm11 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3] +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm13[2],xmm9[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] -; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm13 -; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm8[2],xmm6[3] +; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm14 +; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm10 +; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm3 +; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm2[2],xmm7[3] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm8[2],ymm15[3],ymm8[4],ymm15[5,6],ymm8[7],ymm15[8,9],ymm8[10],ymm15[11],ymm8[12],ymm15[13,14],ymm8[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm11[1],xmm13[2,3] ; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw $82, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm13[2],ymm4[3],ymm13[4],ymm4[5,6],ymm13[7],ymm4[8,9],ymm13[10],ymm4[11],ymm13[12],ymm4[13,14],ymm13[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm10[1],xmm14[2,3] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] @@ -7260,8 +7251,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] @@ -7274,8 +7265,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = 
mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 @@ -7288,12 +7279,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] +; AVX2-FP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm8[0],mem[1,2],ymm8[3],mem[4],ymm8[5],mem[6,7],ymm8[8],mem[9,10],ymm8[11],mem[12],ymm8[13],mem[14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] @@ -7301,28 +7291,28 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = mem[0,1],xmm6[2],mem[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-FP-NEXT: vpblendw $181, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = 
mem[0],ymm7[1],mem[2],ymm7[3],mem[4,5],ymm7[6],mem[7,8],ymm7[9],mem[10],ymm7[11],mem[12,13],ymm7[14],mem[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm4[1,2],ymm13[3],ymm4[4],ymm13[5],ymm4[6,7],ymm13[8],ymm4[9,10],ymm13[11],ymm4[12],ymm13[13],ymm4[14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = mem[0,1],xmm8[2],mem[3] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] @@ -7338,7 +7328,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = xmm15[0,1],mem[2],xmm15[3] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] @@ -7366,16 +7357,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] +; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # 
ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 @@ -7384,19 +7375,18 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] +; AVX2-FP-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm14 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 @@ -7404,7 +7394,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 @@ -7485,16 +7474,15 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride5_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 +; 
AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 @@ -7506,15 +7494,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4],ymm12[5],ymm3[6,7],ymm12[8],ymm3[9,10],ymm12[11],ymm3[12],ymm12[13],ymm3[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 @@ -7523,23 +7512,23 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] @@ -7548,15 +7537,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4],ymm13[5],ymm6[6,7],ymm13[8],ymm6[9,10],ymm13[11],ymm6[12],ymm13[13],ymm6[14,15] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] @@ -7564,14 +7554,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] ; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,u,u,u,4,7,1,6] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] @@ -7579,227 +7569,228 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] ; AVX2-FCP-NEXT: 
vpshufb %xmm4, %xmm11, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm13[1],ymm6[2,3],ymm13[4],ymm6[5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10,11],ymm13[12],ymm6[13],ymm13[14],ymm6[15] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm4, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,1,3,0,3,5,7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm4 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm5 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,u,u,5,7,2,4] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: 
vpermd %ymm5, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10,11],ymm7[12],ymm1[13],ymm7[14],ymm1[15] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, 
%ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm9[1,2],mem[3],ymm9[4],mem[5],ymm9[6,7],mem[8],ymm9[9,10],mem[11],ymm9[12],mem[13],ymm9[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,u,u,5,0,2,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,4,7,0,2,4,7,0] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm10[1],mem[2],ymm10[3],mem[4,5],ymm10[6],mem[7,8],ymm10[9],mem[10],ymm10[11],mem[12,13],ymm10[14],mem[15] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm12 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 
= ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm1[1],ymm7[2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4],ymm4[5],ymm13[6,7],ymm4[8],ymm13[9,10],ymm4[11],ymm13[12],ymm4[13],ymm13[14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm14[1],ymm6[2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7,8],ymm14[9],ymm6[10],ymm14[11],ymm6[12,13],ymm14[14],ymm6[15] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] @@ -7809,92 +7800,91 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,u,u,6,0,3,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] -; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,0,2,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FCP-NEXT: 
vpermd %ymm11, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, 
%xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7],mem[8,9],ymm14[10],mem[11],ymm14[12],mem[13,14],ymm14[15] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: 
vmovaps %ymm5, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r9) ; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FCP-NEXT: vzeroupper @@ -8373,7 +8363,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8384,18 +8374,17 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm11 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -8403,10 +8392,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm19 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: 
vmovdqa64 %ymm6, %ymm27 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 @@ -8415,17 +8404,18 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm30 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,4,7,1,4,6,u,u] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 @@ -8472,20 +8462,21 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,u,u,u,4,7,1,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1 ; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -8494,7 +8485,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,2,5,7,4,7,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 @@ -8504,7 +8495,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] ; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] @@ -8512,29 +8503,28 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm10 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512-FCP-NEXT: 
vmovdqa64 %ymm19, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm16, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] @@ -8547,14 +8537,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm25, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 @@ -8565,7 +8555,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,3,5,2,5,7,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 @@ -8578,17 +8568,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm25 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,2,u,u,5,7,2,4] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 @@ -8605,6 +8594,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -8621,7 +8611,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 @@ -8633,67 +8623,68 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,3,6,0,5,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512-FCP-NEXT: 
vpshufb %ymm1, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm25 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 -; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm30 +; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm18 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm31 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm20, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm20 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,u,u,5,0,2,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] @@ -8703,10 +8694,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpblendw $82, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] @@ -8730,23 +8721,23 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm19, %ymm26, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 
# 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm11 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] @@ -8757,11 +8748,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,u,u,6,0,3,5] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] @@ -8773,7 +8764,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpblendw $181, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] @@ -8811,14 +8802,15 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -9295,7 +9287,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512DQ-FCP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; 
AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9306,18 +9298,17 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -9325,10 +9316,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 @@ -9337,17 +9328,18 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,4,7,1,4,6,u,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 @@ -9394,20 +9386,21 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,u,u,u,4,7,1,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] @@ -9416,7 +9409,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,2,5,7,4,7,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 @@ -9426,7 +9419,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] ; 
AVX512DQ-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] @@ -9434,29 +9427,28 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm16, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] @@ -9469,14 +9461,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm25, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 @@ -9487,7 +9479,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,3,5,2,5,7,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 @@ -9500,17 +9492,16 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,2,u,u,5,7,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 @@ -9527,6 +9518,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -9543,7 +9535,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 @@ -9555,67 +9547,68 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,3,6,0,5,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm25 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm6[0],xmm14[1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm18 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm31 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm20, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm20 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,u,u,5,0,2,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] @@ -9625,10 +9618,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendw $82, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] @@ -9652,23 +9645,23 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] @@ -9679,11 +9672,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,u,u,6,0,3,5] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] @@ -9695,7 +9688,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendw $181, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] @@ -9733,14 +9726,15 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512DQ-FCP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -9760,13 +9754,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512BW-NEXT: 
vpermi2w %zmm4, %zmm3, %zmm6 @@ -9776,11 +9770,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -9790,11 +9784,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -9804,19 +9798,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -9824,7 +9818,7 @@ 
define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -9859,13 +9853,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -9875,11 +9869,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -9889,11 +9883,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -9903,19 +9897,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -9923,7 +9917,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -9958,13 +9952,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; 
AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -9974,11 +9968,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -9988,11 +9982,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -10002,19 +9996,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -10022,7 +10016,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 @@ -10057,13 +10051,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 @@ -10073,11 +10067,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -10087,11 +10081,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14 @@ -10101,19 +10095,19 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -10121,7 +10115,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b21d5c8d..7a8e12ec38884 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -479,13 +479,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i16_stride6_vf4: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: 
vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] @@ -509,13 +509,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FP-LABEL: load_i16_stride6_vf4: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -538,13 +538,13 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i16_stride6_vf4: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -579,12 +579,12 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [8,5,2,11] ; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -610,12 +610,12 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,5,2,11] ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -642,12 +642,12 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [8,5,2,11] ; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -673,12 +673,12 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,5,2,11] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] @@ -932,8 +932,8 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpsrlq $16, %xmm4, %xmm3 -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,6,6,7] @@ -941,33 +941,33 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX-NEXT: vpslld $16, %xmm5, %xmm9 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX-NEXT: 
vpsrldq {{.*#+}} xmm10 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] -; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] ; AVX-NEXT: vpsrld $16, %xmm2, %xmm9 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4,5],xmm8[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,3,3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm8[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm5[4,5],xmm6[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm11[5,6,7] ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm11 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm12[0],xmm11[0] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4],xmm9[5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] @@ -978,23 +978,23 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,4,6] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3,4],xmm7[5,6,7] ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX-NEXT: 
vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX-NEXT: vmovdqa %xmm3, (%rsi) -; AVX-NEXT: vmovdqa %xmm7, (%rdx) +; AVX-NEXT: vmovdqa %xmm6, (%rdx) ; AVX-NEXT: vmovdqa %xmm8, (%rcx) ; AVX-NEXT: vmovdqa %xmm9, (%r8) -; AVX-NEXT: vmovdqa %xmm6, (%r9) +; AVX-NEXT: vmovdqa %xmm7, (%r9) ; AVX-NEXT: vmovdqa %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1004,17 +1004,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] ; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 @@ -1030,7 +1030,7 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] @@ -1041,150 +1041,150 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm4[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4],xmm2[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX2-NEXT: vmovdqa %xmm2, (%rsi) +; AVX2-NEXT: vmovdqa %xmm1, (%rsi) ; AVX2-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-NEXT: vmovdqa %xmm6, (%r8) -; AVX2-NEXT: vmovdqa %xmm1, (%r9) +; AVX2-NEXT: vmovdqa %xmm2, (%r9) ; AVX2-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride6_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FP-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] ; AVX2-FP-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] 
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1],xmm3[2],xmm5[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3,4],xmm5[5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm4, (%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FP-NEXT: vmovdqa %xmm6, (%r8) -; AVX2-FP-NEXT: vmovdqa %xmm4, (%r9) -; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-FP-NEXT: vmovdqa %xmm5, (%r9) +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride6_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), 
%rax -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] ; AVX2-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1],xmm3[2],xmm5[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3,4],xmm5[5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8) -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r9) +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1194,27 +1194,27 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512-NEXT: vpslld $16, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3] ; AVX512-NEXT: 
vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX512-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX512-NEXT: vpbroadcastw 74(%rdi), %xmm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6],xmm5[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] @@ -1234,17 +1234,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm5, (%rdx) @@ -1257,30 +1257,29 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i16_stride6_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpslld $16, %xmm3, %xmm7 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 @@ -1289,32 +1288,33 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, 
%xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r9) -; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax) +; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1324,27 +1324,27 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512DQ-NEXT: vpslld $16, %xmm0, %xmm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX512DQ-NEXT: vpbroadcastw 74(%rdi), %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = 
xmm7[1,1,1,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX512DQ-NEXT: vpbroadcastw 74(%rdi), %xmm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6],xmm5[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] @@ -1364,17 +1364,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rdx) @@ -1387,30 +1387,29 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpslld $16, %xmm3, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 @@ -1419,32 +1418,33 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1453,17 +1453,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1479,17 +1479,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512BW-FCP-NEXT: vmovdqa 
{{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1505,17 +1505,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1531,17 +1531,17 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18,24,30,36,42] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19,25,31,37,43] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20,26,32,38,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21,27,33,39,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22,28,34,40,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23,29,35,41,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2055,10 +2055,10 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i16_stride6_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] @@ -2067,16 +2067,16 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] @@ -2090,11 +2090,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -2105,7 +2105,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -2132,38 +2132,38 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,2,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 
= xmm6[0,1,2,3,6,6,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-NEXT: vmovdqa %ymm5, (%r9) @@ -2175,9 +2175,9 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i16_stride6_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] @@ -2187,34 +2187,34 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm14 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm10 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] @@ -2223,7 +2223,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2248,15 +2248,15 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7] ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm7, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 @@ -2271,16 +2271,16 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm4, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) @@ -2292,9 +2292,9 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i16_stride6_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] @@ -2304,34 +2304,34 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1] 
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm14 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm10 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] @@ -2340,7 +2340,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -2365,15 +2365,15 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2,3],xmm4[4],xmm10[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 @@ -2388,16 +2388,16 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) @@ -2417,27 +2417,27 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX512-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] ; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm9, %xmm13, %xmm12 -; AVX512-NEXT: vpshufb %xmm9, %xmm11, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm12 +; AVX512-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3],xmm12[4,5],xmm10[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] @@ -2483,9 +2483,9 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] +; 
AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] @@ -2497,12 +2497,12 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2531,8 +2531,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] @@ -2579,14 +2579,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,1,2,1,4,5,6,7] 
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -2641,27 +2641,27 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm13, %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm11, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3],xmm12[4,5],xmm10[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] @@ -2707,9 +2707,9 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] @@ -2721,12 +2721,12 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2755,8 +2755,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] @@ -2803,14 +2803,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,1,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -2863,41 +2863,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} 
ymm6 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] ; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -2918,41 +2916,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -2973,41 +2969,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] -; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -3028,41 +3022,39 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -4136,50 +4128,48 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i16_stride6_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm0, %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm4, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm8[2,3],ymm7[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm8[0,1],ymm7[0,1] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm6 -; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] @@ -4187,32 +4177,32 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm1 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3] +; AVX2-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] -; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7] +; AVX2-NEXT: vpshufb %ymm13, %ymm8, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3,4],xmm7[5,6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4221,51 +4211,51 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7] ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5,6],mem[7] +; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7] -; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,2,0,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2],xmm9[3],xmm14[4,5],xmm9[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[0],ymm14[1],mem[2,3,4,5],ymm14[6],mem[7] +; AVX2-NEXT: vpshufb %ymm13, %ymm14, %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3,4,5,6,7],ymm9[8,9,10],ymm6[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,1,0,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm8 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4,5],xmm1[6],xmm7[7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm1 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm2 +; AVX2-NEXT: vpshufb %ymm6, %ymm14, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] @@ -4278,99 +4268,99 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX2-NEXT: vpblendd $148, (%rsp), %ymm15, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,1,2,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,5,6,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,2,0,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-NEXT: vpshufb %ymm6, %ymm14, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm11 -; 
AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,1,2,3] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1,2],xmm8[3],xmm13[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,7,5,6,5] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,1,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-NEXT: vpshufb %ymm6, %ymm14, %ymm8 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-NEXT: vpshufb 
%ymm7, %ymm2, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,3,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4378,24 +4368,24 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm5 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,0,2,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX2-NEXT: vpblendw 
{{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm2 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] @@ -4412,10 +4402,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm5, 32(%r9) ; AVX2-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) @@ -4452,7 +4442,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4531,7 +4521,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,u,u,u,u,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] @@ -4579,7 +4569,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] ; 
AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] @@ -4621,7 +4611,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] @@ -4728,7 +4718,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4807,7 +4797,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,u,u,u,u,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] @@ -4855,7 +4845,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm9 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7] @@ -4897,7 +4887,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] @@ -4978,96 +4968,97 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride6_vf32: ; AVX512: # 
%bb.0: -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] -; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512-NEXT: subq $136, %rsp +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,0,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX512-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] +; AVX512-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,2,2,2,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3,4],xmm1[5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] +; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm7 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7] -; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512-NEXT: vmovdqa64 %ymm12, %ymm28 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm10 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX512-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20 -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] -; AVX512-NEXT: vpshufb %xmm9, %xmm10, %xmm9 -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3] -; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7] -; AVX512-NEXT: vmovdqa64 %ymm15, %ymm25 -; AVX512-NEXT: vmovdqa64 %ymm12, %ymm27 -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512-NEXT: vmovdqa64 %ymm10, %ymm19 +; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7] +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm11[2,3],mem[2,3] +; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm11 +; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX512-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm26 +; AVX512-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm0 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512-NEXT: vmovdqa64 %ymm12, %ymm30 ; AVX512-NEXT: vmovdqa64 %ymm14, %ymm31 -; AVX512-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] @@ -5075,8 +5066,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; 
AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] @@ -5087,10 +5078,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm23 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm25 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512-NEXT: vmovdqa64 %ymm15, %ymm21 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] @@ -5098,17 +5089,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm13 +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm13 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm28 -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm29 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] @@ -5117,7 +5106,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] @@ -5127,7 +5116,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm2 = zmm17 ^ (zmm0 & (zmm2 ^ zmm17)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm2)) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] @@ -5139,8 +5128,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30 -; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm18 +; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22 +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] @@ -5161,72 +5150,71 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5)) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm2 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,3,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] -; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] +; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm10 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm11) | ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm8 ; AVX512-NEXT: movw $31, %ax ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7] +; AVX512-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7] ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm12 +; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm14 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7] +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4 ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} ; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 @@ -5234,115 +5222,116 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa32 %zmm30, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = mem ^ (zmm3 & (zmm15 ^ mem)) -; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) +; 
AVX512-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm17 & (zmm14 ^ zmm2)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0)) ; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: addq $136, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $136, %rsp -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-FCP-NEXT: subq $104, %rsp +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 
= ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] -; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] -; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm13[0],ymm0[1],ymm13[2,3,4,5],ymm0[6],ymm13[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] @@ -5350,18 +5339,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm21 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] @@ -5371,26 +5360,26 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm29 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm15 -; 
AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm16 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 @@ -5398,9 +5387,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] @@ -5408,7 +5398,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm17 ^ (zmm0 & (zmm11 ^ zmm17)) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm11)) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] @@ -5419,116 +5409,116 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpshuflw 
{{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm7 ^ (zmm0 & (zmm5 ^ zmm7)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm8 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4],xmm8[5],xmm3[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm11) | ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[2,2,2,2,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm9 ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm9 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7] +; AVX512-FCP-NEXT: 
vextracti128 $1, %ymm9, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3,4,5],ymm12[6],ymm4[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm11) -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm8 & ymm13) +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm2 & (zmm5 ^ mem)) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm15 = mem ^ (zmm2 & (zmm15 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm3)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm1)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $136, %rsp +; AVX512-FCP-NEXT: addq $104, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -5560,28 +5550,28 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],mem[2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm18 ; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7] ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm2[1],ymm12[2,3,4,5],ymm2[6],ymm12[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm7 @@ -5611,21 +5601,21 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm17 & (zmm2 ^ zmm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm17 & (zmm5 ^ zmm3)) ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29 @@ -5674,19 +5664,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9)) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] @@ -5697,12 +5687,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] @@ -5732,33 +5722,33 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm7 = 
[8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm14) | ymm11 ; AVX512DQ-NEXT: movw $31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7] +; AVX512DQ-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6],ymm11[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,3,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[0,1,0,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm11 ; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10 @@ -5771,7 +5761,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] @@ -5779,18 +5769,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm14) | ymm3 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -5811,109 +5801,109 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: pushq %rax -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 
32(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm5 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: 
vpshufb %xmm6, %xmm4, %xmm11 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3],xmm7[4,5],xmm10[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm0[1],ymm10[2,3,4,5],ymm0[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm16 ^ (zmm17 & (zmm11 ^ zmm16)) ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] -; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5],xmm8[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2],xmm6[3],xmm0[4,5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm17 & (zmm2 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm17 & (zmm3 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm30 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, 
%ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] @@ -5924,137 +5914,137 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16)) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9)) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5,6],xmm7[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm0 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm13) | ymm8 ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm8 & ymm13) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm13) ; AVX512DQ-FCP-NEXT: 
vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -6083,11 +6073,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -6101,17 +6091,17 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -6125,11 +6115,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6141,7 +6131,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6171,11 +6161,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -6189,17 +6179,17 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -6213,11 +6203,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6229,7 +6219,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6259,11 +6249,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -6277,17 +6267,17 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -6301,11 +6291,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6317,7 +6307,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -6347,11 +6337,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -6365,17 +6355,17 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -6389,11 +6379,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = 
[4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -6405,7 +6395,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -8608,127 +8598,129 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i16_stride6_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $1272, %rsp # imm = 0x4F8 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] ; 
AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm7 -; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm13 -; AVX2-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3,4,5],ymm2[6],ymm4[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm15, %ymm1, 
%ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm1 +; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm1[2,3],ymm0[2,3] ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm1[0,1],ymm0[0,1] ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm13 -; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm6 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[0,1],ymm0[0,1] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vpshufb %xmm5, %xmm14, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm1 +; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm12 +; AVX2-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11 +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX2-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm12 +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX2-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-NEXT: vpblendvb %ymm15, %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-NEXT: # xmm7 = mem[1,1,2,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm7 +; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm9, %ymm13, %ymm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-NEXT: 
vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8741,9 +8733,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] @@ -8753,16 +8745,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm2 +; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; 
AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] @@ -8770,9 +8763,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 @@ -8781,7 +8774,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm14 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] @@ -8789,45 +8782,45 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 736(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 736(%rdi), %ymm7 +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3,4],xmm12[5,6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3,4],xmm12[5,6,7] +; AVX2-NEXT: vpshufb %xmm1, 
%xmm7, %xmm7 +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,2,2,2,4,5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm9 +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7] +; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm9 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm8 +; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm7 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,2,0,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm15[2],xmm8[3],xmm15[4,5],xmm8[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] -; AVX2-NEXT: vpshufb %ymm13, %ymm8, %ymm13 +; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4,5],ymm7[6],mem[7] +; AVX2-NEXT: vpshufb %ymm13, %ymm7, %ymm13 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15] @@ -8836,64 +8829,64 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-NEXT: # xmm9 = mem[2,1,0,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3,4],xmm9[5],xmm5[6],xmm9[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3,4],xmm9[5],xmm4[6],xmm9[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm9, %ymm13, %ymm13 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2],ymm13[3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = mem[2,1,0,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3],xmm15[4,5],xmm7[6],xmm15[7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd $198, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = mem[2,1,0,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6],xmm7[7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,1,0,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5],xmm8[6],xmm6[7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,1,0,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6],xmm4[7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm9, %ymm5, %ymm5 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm4, %xmm12, %xmm5 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm11, %xmm2 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] -; AVX2-NEXT: vpshufb %ymm9, %ymm8, %ymm2 +; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,1,0,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] @@ -8923,8 +8916,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] @@ -8933,16 +8926,16 @@ define void @load_i16_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8957,27 +8950,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = 
mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -8994,45 +8987,45 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpshufb %ymm8, %ymm9, %ymm13 +; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm13 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm13[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,6,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,0,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} 
xmm8 = xmm8[0,1,3,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,1,2,0,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1,2],xmm12[3],xmm8[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm8, %ymm15, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] +; AVX2-NEXT: vpshufb %ymm9, %ymm15, %ymm9 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -9062,12 +9055,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm2 +; AVX2-NEXT: vpshufb %ymm7, %ymm10, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] ; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -9076,7 +9070,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-NEXT: vpshuflw $244, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] @@ -9086,8 +9080,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,5,6,5] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] @@ -9100,253 +9094,255 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm4 +; AVX2-NEXT: vpshufb %xmm8, %xmm12, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,3,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,0,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm10 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7] +; AVX2-NEXT: vpshufb %xmm8, %xmm14, %xmm11 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,3,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,1,0,2,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm8 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: 
vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-NEXT: vpshufb %xmm2, %xmm14, %xmm7 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-NEXT: vmovdqa %ymm6, 96(%r8) -; AVX2-NEXT: vmovdqa %ymm8, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-NEXT: vmovdqa %ymm10, 96(%r9) -; AVX2-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm10, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-NEXT: vmovdqa %ymm11, 96(%r9) +; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-NEXT: vmovdqa %ymm6, 64(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride6_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $1304, %rsp # imm = 0x518 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} 
ymm4 = ymm5[0,1],ymm4[0,1] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[0,1],ymm2[0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0],xmm9[1],xmm4[2,3],xmm9[4],xmm4[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[0,1],ymm1[0,1] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm8, %ymm13 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm13 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm14, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm11[1],xmm1[2,3],xmm11[4],xmm1[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm10 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm5 +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm8, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm0[2,1,0,3] ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9364,27 +9360,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] -; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; 
AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] @@ -9393,7 +9389,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] @@ -9402,8 +9398,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] +; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] @@ -9447,14 +9443,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,u,u,u,u,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6],xmm9[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] @@ -9467,17 +9463,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] @@ -9488,7 +9484,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9502,7 +9498,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] @@ -9512,71 +9508,72 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = 
ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9585,354 +9582,356 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,0,3] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; 
AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm12 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm13, %ymm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm15 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,1,2,1] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,6,5,6,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2,3],xmm3[4],xmm15[5,6],xmm3[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd 
$36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,1,2,0,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb 
%ymm7, %ymm11, %ymm11 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm4 +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm1 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[3,1,2,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm6 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm10[3,1,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = mem[3,1,2,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vinserti128 $1, 
%xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm12, %xmm2 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,5,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm4 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] +; 
AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm10 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3],xmm3[4],xmm10[5],xmm3[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5,6],ymm10[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,3,2,1] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm14, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5],xmm10[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5],xmm7[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 
= mem[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm14, %xmm11 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rcx) +; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FP-NEXT: vmovdqa %ymm10, 96(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FP-NEXT: addq $1304, %rsp # imm = 0x518 +; AVX2-FP-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FP-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride6_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $1304, %rsp # imm = 0x518 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} 
ymm14 = ymm3[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[0,1],ymm2[0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0],xmm9[1],xmm4[2,3],xmm9[4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[0,1],ymm1[0,1] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm8 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm13 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm13 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm14, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm11[1],xmm1[2,3],xmm11[4],xmm1[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm0 ; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9950,27 +9949,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] -; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] @@ -9979,7 +9978,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] @@ -9988,8 +9987,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] +; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] @@ -10033,14 +10032,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,u,u,u,u,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6],xmm9[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] @@ -10053,17 +10052,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] @@ -10074,7 +10073,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -10088,7 +10087,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] @@ -10098,71 +10097,72 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10171,238 +10171,239 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,0,3] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm0 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm12 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm15 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm15 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,1,2,1] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,6,5,6,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2,3],xmm3[4],xmm15[5,6],xmm3[7] 
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshuflw $103, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm4 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[3,1,2,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 
= xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm2 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,5,6,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm10 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3],xmm3[4],xmm10[5],xmm3[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5,6],ymm10[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,3,2,1] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5],xmm10[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5],xmm7[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 96(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi) +; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm10, 96(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FCP-NEXT: addq $1304, %rsp # imm = 0x518 +; AVX2-FCP-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FCP-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride6_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm0 ; AVX512-NEXT: vextracti32x4 $1, %ymm1, %xmm20 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3] @@ -10417,31 +10418,31 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] +; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm22 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm22[0,2,0,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1 @@ -10453,7 +10454,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10476,17 +10477,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm1 +; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10495,9 +10496,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2 @@ -10510,30 +10511,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7] -; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm1 = ymm18[2,3],mem[2,3] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX512-NEXT: vinserti32x4 $1, 288(%rdi), %ymm18, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = 
[2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm0 -; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm14 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7 @@ -10542,8 +10543,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm0 -; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm4 @@ -10560,9 +10561,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm2 @@ -10574,8 +10575,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512-NEXT: vpshufb %xmm9, %xmm12, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10602,9 +10603,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -10617,8 +10618,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti32x4 $2, 
%xmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] @@ -10629,16 +10630,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512-NEXT: vpblendd $219, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] @@ -10659,7 +10660,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm29 & (zmm1 ^ zmm2)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm22 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm1)) ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10673,8 +10674,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; 
AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] @@ -10685,8 +10686,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] @@ -10695,14 +10696,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] @@ -10711,8 +10712,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10807,26 +10808,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] 
-; AVX512-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] +; AVX512-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX512-NEXT: vpshufb %xmm13, %xmm5, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm4 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] @@ -10844,8 +10845,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX512-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] @@ -10862,11 +10863,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: 
vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm14 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] @@ -10882,8 +10883,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm12 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] @@ -10894,11 +10895,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX512-NEXT: vpshufb %xmm13, %xmm7, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] @@ -10915,7 +10916,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0 @@ -11000,7 +11001,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ zmm30)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm31)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) @@ -11048,29 +11049,29 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11091,7 +11092,7 @@ 
define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm17 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] @@ -11108,10 +11109,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm30 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2 @@ -11139,21 +11140,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm16 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm1 = ymm16[2,3],mem[2,3] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] @@ -11231,9 +11232,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 
%xmm3, %xmm23 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] @@ -11246,11 +11247,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm19 @@ -11258,16 +11259,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm18 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd $36, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm5 @@ -11288,7 +11289,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm29 & (zmm4 ^ zmm3)) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm26 & (zmm5 ^ zmm4)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -11301,8 +11302,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm3 @@ -11312,8 +11313,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 @@ -11321,14 +11322,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = 
ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm0 @@ -11336,8 +11337,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -11362,7 +11363,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm19 @@ -11431,9 +11432,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] @@ -11445,11 +11446,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded 
Reload -; AVX512-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] @@ -11466,8 +11467,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 @@ -11483,11 +11484,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm6 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm3 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] @@ -11502,8 +11503,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm4 @@ -11513,24 +11514,24 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm31 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} ; AVX512-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm12 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 @@ -11538,8 +11539,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm29) | ymm10 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 @@ -11562,14 +11563,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm29) | ymm14 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 @@ -11579,40 +11580,40 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1} ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm4 & (zmm5 ^ mem)) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm4 & (zmm5 ^ mem)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm4 & (zmm5 ^ mem)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 
%zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm4 & (zmm5 ^ mem)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm30)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm26 & (zmm3 ^ zmm30)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm31)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm0)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm26 & (zmm1 ^ zmm2)) @@ -11622,7 +11623,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) @@ -11658,27 +11659,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm15, %xmm21 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm1 
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[0,2,0,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX512DQ-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm2 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11696,9 +11697,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] @@ -11723,9 +11724,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -11739,22 +11740,22 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] -; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 @@ -11778,18 +11779,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm7 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm4 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm5 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm3 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -11813,7 +11814,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-NEXT: 
vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm4 @@ -11901,7 +11902,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm21 & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm18 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -12047,7 +12048,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm5, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] @@ -12146,7 +12147,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm13 @@ -12209,7 +12210,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm3 & (zmm24 ^ zmm23)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm3 & (zmm21 ^ zmm25)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm3 & (zmm0 ^ zmm26)) @@ -12257,28 +12258,28 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FCP-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,1,0,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 @@ -12334,16 +12335,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, 
%xmm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm2 @@ -12352,10 +12353,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] @@ -12382,14 +12383,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -12397,9 +12398,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 @@ -12461,7 +12462,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23 @@ -12501,7 +12502,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm20 & (zmm4 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -12574,7 +12575,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 @@ -12823,23 +12824,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-NEXT: 
vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm14, %zmm15 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm8 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} @@ -12850,18 +12851,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 @@ -12872,14 +12873,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm12, 
%zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} @@ -12890,18 +12891,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 @@ -12909,38 +12910,38 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm11, 
%zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm24 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} @@ -12968,23 +12969,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} @@ -12995,18 +12996,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 @@ -13017,14 +13018,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 
%zmm21, %zmm17 {%k2} @@ -13035,18 +13036,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 @@ -13054,38 +13055,38 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: movw $31, %di ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm21, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2w 
%zmm5, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} @@ -13113,23 +13114,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} @@ -13140,18 +13141,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 @@ -13162,14 +13163,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} @@ -13180,18 +13181,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; 
AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 @@ -13199,38 +13200,38 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: movw $31, %di ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm21, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm24, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm24 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 -; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} @@ -13258,23 +13259,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} @@ -13285,18 +13286,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm16 @@ -13307,14 +13308,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} @@ -13325,18 +13326,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, 
%zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm20 @@ -13344,38 +13345,38 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movw $31, %di ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm21, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, 
%zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73bd9fed2..398fd07e05a3f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -70,11 +70,11 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vmovd %xmm2, (%rsi) ; AVX-NEXT: vmovd %xmm4, (%rdx) ; AVX-NEXT: vmovd %xmm6, (%rcx) ; AVX-NEXT: vpextrd $2, %xmm5, (%r8) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vmovd %xmm7, (%r9) ; AVX-NEXT: vmovd %xmm3, (%r10) ; AVX-NEXT: vmovd %xmm0, (%rax) @@ -100,11 +100,11 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vmovd %xmm2, (%rsi) ; AVX2-NEXT: vmovd %xmm4, (%rdx) ; AVX2-NEXT: vmovd %xmm6, (%rcx) ; AVX2-NEXT: vpextrd 
$2, %xmm5, (%r8) +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vmovd %xmm7, (%r9) ; AVX2-NEXT: vmovd %xmm3, (%r10) ; AVX2-NEXT: vmovd %xmm0, (%rax) @@ -186,11 +186,11 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vmovd %xmm2, (%rsi) ; AVX512-NEXT: vmovd %xmm4, (%rdx) ; AVX512-NEXT: vmovd %xmm6, (%rcx) ; AVX512-NEXT: vpextrd $2, %xmm5, (%r8) +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vmovd %xmm7, (%r9) ; AVX512-NEXT: vmovd %xmm3, (%r10) ; AVX512-NEXT: vmovd %xmm0, (%rax) @@ -237,18 +237,18 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw 8(%rdi), %xmm7 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpsrlq $48, %xmm1, %xmm8 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovd %xmm4, (%rdx) ; AVX512DQ-NEXT: vmovd %xmm6, (%rcx) ; AVX512DQ-NEXT: vpextrd $2, %xmm5, (%r8) +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovd %xmm7, (%r9) ; AVX512DQ-NEXT: vmovd %xmm3, (%r10) ; AVX512DQ-NEXT: vmovd %xmm0, (%rax) @@ -302,11 +302,11 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-NEXT: vmovd %xmm4, (%rdx) ; AVX512BW-NEXT: vmovd %xmm6, (%rcx) ; AVX512BW-NEXT: vpextrd $2, %xmm5, (%r8) +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512BW-NEXT: vmovd %xmm7, (%r9) ; AVX512BW-NEXT: vmovd %xmm3, (%r10) ; AVX512BW-NEXT: vmovd %xmm0, (%rax) @@ -353,18 +353,18 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; AVX512DQ-BW-NEXT: vpbroadcastw 8(%rdi), %xmm7 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; AVX512DQ-BW-NEXT: vpsrlq 
$48, %xmm1, %xmm8 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rdx) ; AVX512DQ-BW-NEXT: vmovd %xmm6, (%rcx) ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm5, (%r8) +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovd %xmm7, (%r9) ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%r10) ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax) @@ -603,112 +603,112 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FP-LABEL: load_i16_stride7_vf4: ; AVX2-FP: # %bb.0: +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vmovq %xmm1, (%rcx) +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vmovq %xmm6, (%r9) ; AVX2-FP-NEXT: vmovq %xmm7, (%r10) -; AVX2-FP-NEXT: vmovq %xmm0, (%rax) +; AVX2-FP-NEXT: vmovq %xmm3, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride7_vf4: ; AVX2-FCP: # %bb.0: +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) ; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) +; AVX2-FCP-NEXT: vmovq %xmm3, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride7_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] @@ -754,16 +754,16 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i16_stride7_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] @@ -804,13 +804,13 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i16_stride7_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] @@ -856,16 +856,16 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] @@ -1253,13 +1253,13 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm12[5,6,7] ; AVX-NEXT: vpsrlq $16, %xmm5, %xmm11 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm12 @@ -1338,11 +1338,11 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] @@ -1406,22 +1406,22 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm7[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm2[3],xmm10[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3,4],xmm11[5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] @@ -1449,11 +1449,11 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -1463,8 +1463,8 @@ define void 
@load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx) +; AVX2-FP-NEXT: vmovdqa %xmm6, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FP-NEXT: vmovdqa %xmm10, (%r8) ; AVX2-FP-NEXT: vmovdqa %xmm11, (%r9) @@ -1485,22 +1485,22 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm7[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm2[3],xmm10[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3,4],xmm11[5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] @@ -1528,11 +1528,11 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 
= xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -1542,8 +1542,8 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r8) ; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r9) @@ -1581,11 +1581,11 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] @@ -1646,24 +1646,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3,4],xmm9[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4],xmm6[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] @@ -1672,40 +1672,40 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512-FCP-NEXT: 
vpshufb %xmm11, %xmm12, %xmm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm12[1,2,3,4,5,6],xmm2[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8) ; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9) @@ -1743,11 +1743,11 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] @@ -1808,24 +1808,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3,4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] @@ -1834,40 +1834,40 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm12[1,2,3,4,5,6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9) @@ -1882,19 +1882,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1912,19 +1912,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1942,19 +1942,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,7,14,21,28,35,42,49] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1972,19 +1972,19 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2452,7 +2452,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 48(%rdi), %xmm13 ; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm13[2],zero ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7] ; AVX-NEXT: vmovaps 
{{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11 @@ -2510,7 +2510,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; AVX-NEXT: vmovdqa %xmm4, %xmm15 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -2529,7 +2529,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -2589,7 +2589,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vorps %ymm1, %ymm9, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] @@ -2606,15 +2606,15 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX-NEXT: vpsrlq $16, %xmm13, %xmm9 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,1,0,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 @@ 
-2624,7 +2624,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX-NEXT: vpsrlq $48, %xmm6, %xmm1 -; AVX-NEXT: vmovdqa %xmm6, %xmm15 +; AVX-NEXT: vmovdqa %xmm6, %xmm7 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] @@ -2636,15 +2636,19 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa %xmm3, %xmm15 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX-NEXT: vmovdqa %xmm4, %xmm3 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vmovdqa %xmm11, %xmm4 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] ; AVX-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -2658,7 +2662,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] @@ -2667,14 +2671,13 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX-NEXT: 
vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] @@ -2709,41 +2712,41 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,4,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,6,4,7] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm9[0,1,0,2] ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,1,4,5,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6],xmm8[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3,4,5],xmm10[6],xmm13[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7,8,9,10],ymm12[11],ymm10[12,13,14,15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-NEXT: vmovdqa %xmm11, %xmm10 -; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3],ymm10[4,5,6,7,8,9,10],ymm14[11],ymm10[12,13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-NEXT: vmovdqa %xmm12, %xmm10 +; AVX2-NEXT: vpblendvb %ymm10, %ymm13, %ymm14, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] @@ -2751,29 +2754,29 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-NEXT: vpblendw 
{{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4],ymm12[5,6,7,8,9,10,11],ymm11[12],ymm12[13,14,15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,2] ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7] @@ -2782,19 +2785,19 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,2,1,0,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-NEXT: vpblendvb %ymm10, %ymm13, %ymm14, %ymm10 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vextracti128 
$1, %ymm11, %xmm13 -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] @@ -2815,7 +2818,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15] ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] @@ -2831,17 +2834,17 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpblendw 
{{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] @@ -2884,84 +2887,84 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,8,9,10,11,6,7,6,7] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,4,7] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2,3,4,5],xmm11[6],xmm14[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm13, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3],ymm11[4,5,6,7,8,9,10],ymm15[11],ymm11[12,13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FP-NEXT: vmovdqa %xmm13, %xmm11 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4],ymm14[5,6,7,8,9,10,11],ymm13[12],ymm14[13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[0,1,1,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31] +; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1,2,3,4,5,6,7],ymm12[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] @@ -2971,72 +2974,72 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4,5,6],ymm7[7,8],ymm14[9,10,11,12,13,14],ymm7[15] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7,8,9,10,11,12,13],ymm14[14],ymm15[15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2],xmm13[3],xmm12[4],xmm13[5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] +; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm15 +; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5,6,7],ymm12[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm15[1,2,3,4,5,6],ymm6[7,8],ymm15[9,10,11,12,13,14],ymm6[15] +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm7[2,3],ymm1[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4],xmm0[5],xmm4[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: 
vmovdqa %ymm8, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm10, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm8, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3046,82 +3049,83 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FCP-LABEL: load_i16_stride7_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm1[2],ymm6[3,4,5],ymm1[6],ymm6[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm11 = ymm11[0,2,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3,4,5],xmm10[6],xmm13[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,5,1,0,4,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm10 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,5,1,u,4,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm14, %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vmovdqa %xmm12, %xmm10 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm14, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,6,1,0,5,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,6,1,u,5,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 +; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,1,1,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] @@ -3131,72 +3135,72 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, 
%ymm14, %ymm10 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,7,2,6,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,7,2,6,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2],xmm12[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2],xmm12[3],xmm7[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4,5,6,7],ymm11[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,0,3,7,0] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,3,7,u] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm14, %ymm14 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FCP-NEXT: vpshufb 
%ymm15, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7],ymm7[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,7,3,6,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,4,7,3,6,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4,5,6,7],ymm13[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,7,0,0,4,7,0] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm7[1,2,3,4,5,6,7],ymm13[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,4,7,0,0,4,7,0] +; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm0 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,6,1,5,2,6,1,5] +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,0,3,7,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,0,3,7,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3239,13 +3243,13 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm7 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -3268,26 +3272,26 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5,6,7,8,9,10,11],ymm12[12],ymm11[13,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %ymm11, %ymm10, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %ymm11, %ymm10, %ymm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] @@ -3385,146 +3389,146 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i16_stride7_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,4,7,11,14,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,9,u,12,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm6[0,1,0,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5],xmm1[6],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5],xmm3[6],xmm15[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3,4,5],xmm15[6],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, 
%zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,7,10,14,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm12 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; 
AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,u,0,3,7,u] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm14 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,7,0,0,4,7,0] +; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,4,8,11,15,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm11, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3560,13 +3564,13 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm7 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -3589,26 +3593,26 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5,6,7,8,9,10,11],ymm12[12],ymm11[13,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm11, %ymm10, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm11, %ymm10, %ymm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] @@ -3706,146 +3710,146 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, 
%ymm13, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,4,7,11,14,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,9,u,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,9,u,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm6[0,1,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5],xmm1[6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5],xmm3[6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] -; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3,4,5],xmm15[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = 
[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,7,10,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm12 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,u,0,3,7,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; 
AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,4,8,11,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3861,51 +3865,44 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = 
[0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -3927,51 +3924,44 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] -; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] -; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -3993,51 +3983,44 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] -; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] -; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -4059,51 +4042,44 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,6,13,20,27,34,41] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,0,7,14,21,28,35,42] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,1,8,15,22,29,36,43] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] -; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,2,9,16,23,30,37,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,3,10,17,24,31,38,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,4,11,18,25,32,39,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] -; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[u,u,u,u,u,u,u,u,u,5,12,19,26,33,40,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -5563,15 +5539,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i16_stride7_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm11[1],ymm8[2,3,4],ymm11[5],ymm8[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 @@ -5580,33 +5555,33 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2],ymm11[3,4,5],ymm5[6],ymm11[7] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; 
AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX2-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -5614,61 +5589,64 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX2-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7] +; AVX2-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-NEXT: 
vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa %ymm9, %ymm3 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm14[2,3],ymm9[4,5],ymm14[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] +; AVX2-NEXT: vmovdqa %ymm10, %ymm3 +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] -; AVX2-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm7, %ymm14 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] +; AVX2-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -5680,9 +5658,9 @@ define void 
@load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] @@ -5694,17 +5672,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2] -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -5713,10 +5691,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3,4,5],ymm15[6],ymm12[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -5725,7 +5703,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] @@ -5735,8 +5713,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -5747,10 +5726,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6,7],ymm12[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] @@ -5760,8 +5739,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -5773,11 +5752,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd 
{{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -5786,16 +5765,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[0,1],ymm7[2],mem[3,4],ymm7[5],mem[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] @@ -5804,17 +5781,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm10[2],ymm8[3,4,5],ymm10[6],ymm8[7] ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] @@ -5830,7 +5807,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] @@ -5846,7 +5823,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] @@ -5859,7 +5836,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 @@ -5871,21 +5849,21 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] -; AVX2-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX2-NEXT: vpshufb %ymm8, %ymm12, %ymm8 ; AVX2-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7] +; AVX2-NEXT: vmovdqa %ymm15, %ymm13 +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2,3,4,5,6],ymm15[7,8],ymm12[9,10,11,12,13,14],ymm15[15] +; AVX2-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] @@ -5904,9 +5882,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] @@ -5917,8 +5895,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -5928,11 +5906,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] ; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] @@ -5943,7 +5921,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5975,10 +5953,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i16_stride7_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm6 @@ -5992,25 +5969,27 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm5[2],ymm1[3,4,5],ymm5[6],ymm1[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] 
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] @@ -6033,8 +6012,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] @@ -6043,22 +6022,23 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] @@ -6068,17 +6048,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 @@ -6093,38 +6073,38 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm15, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FP-NEXT: vpshufb 
%xmm13, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4],xmm3[5],xmm15[6],xmm3[7] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] @@ -6132,32 +6112,33 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm12 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] @@ -6170,7 +6151,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -6181,120 +6162,121 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm9[2],mem[3,4,5],ymm9[6],mem[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm8 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm10 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm8[2],ymm14[3,4,5],ymm8[6],ymm14[7] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm12[2,3],ymm1[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm7[1,2,3,4,5,6,7],ymm2[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm8 +; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,5],xmm12[6],xmm11[7] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4,5,6],ymm15[7,8],ymm10[9,10,11,12,13,14],ymm15[15] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-FP-NEXT: vpshufb 
%xmm1, %xmm14, %xmm1 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] @@ -6309,33 +6291,33 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-FP-NEXT: vextracti128 $1, 
%ymm4, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] @@ -6392,7 +6374,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 @@ -6405,7 +6387,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,1,0,4,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,1,u,4,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -6430,11 +6412,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,1,0,5,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,6,1,u,5,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -6473,9 +6455,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm2 @@ -6483,8 +6465,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6493,15 +6475,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm15 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[0,1,0,2] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2],ymm10[3,4,5],ymm7[6],ymm10[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] @@ -6525,35 +6506,36 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm14 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm13 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] @@ -6568,7 +6550,6 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -6579,16 +6560,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -6598,7 +6580,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,2,6,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 @@ -6609,15 +6591,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -6629,10 +6612,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FCP-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4] ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 @@ -6640,14 +6624,13 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm15 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm15 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15 @@ -6655,73 +6638,72 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; 
AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,4,7,0,0,4,7,0] +; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm2 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,4,0,3,7,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,0,3,7,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm14, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] @@ -6761,14 +6743,15 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7] +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm13[2],ymm4[3,4,5],ymm13[6],ymm4[7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] @@ -6788,8 +6771,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX512-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3] +; AVX512-NEXT: vmovdqa 224(%rdi), %xmm14 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 @@ -6797,26 +6780,26 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm21 {%k1} ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512-NEXT: vmovdqa 240(%rdi), %xmm15 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] +; 
AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 @@ -6824,37 +6807,41 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm19 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512-NEXT: vmovdqa64 %xmm14, %xmm17 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm23 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512-NEXT: vmovdqa64 %ymm12, %ymm29 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3,4],ymm8[5],ymm4[6,7] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm25 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX512-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm20 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,1,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -6865,9 +6852,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512-NEXT: vpsrlq $48, %xmm14, %xmm2 +; AVX512-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 +; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] @@ -6880,11 +6867,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX512-NEXT: vpsrld $16, %xmm13, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm13, %xmm31 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16 -; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm17 +; AVX512-NEXT: vpsrld $16, %xmm14, %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; AVX512-NEXT: 
vmovdqa64 %xmm15, %xmm16 +; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm19 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7] @@ -6906,19 +6892,19 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2,3],xmm7[4],xmm14[5],xmm7[6],xmm14[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3,4,5,6],ymm12[7,8],ymm14[9,10,11,12,13,14],ymm12[15] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6],ymm7[7,8],ymm3[9,10,11,12,13,14],ymm7[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm7[3,4,5,6],xmm12[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] @@ -6928,7 +6914,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] @@ -6936,7 +6922,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] ; AVX512-NEXT: vextracti128 $1, 
%ymm3, %xmm7 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6],xmm3[7] -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7 +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm7 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ~mem) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7] @@ -6947,7 +6933,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] @@ -6957,7 +6943,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm3 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm27 @@ -6965,19 +6951,19 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm12[4],xmm3[5],xmm12[6],xmm3[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0] -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpor %ymm0, %ymm12, %ymm0 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm26 ; 
AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7] @@ -6988,19 +6974,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] @@ -7014,400 +7001,408 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3],ymm2[4,5,6,7,8,9,10],ymm10[11],ymm2[12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3,4,5],xmm14[6],xmm10[7] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX512-NEXT: vpor %ymm2, %ymm10, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7,8,9,10,11],ymm14[12],ymm2[13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4],ymm2[5,6,7,8,9,10,11],ymm12[12],ymm2[13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512-NEXT: vpor %ymm2, %ymm12, %ymm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6],ymm9[7,8],ymm2[9,10,11,12,13,14],ymm9[15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6,7,8],ymm11[9],ymm9[10,11,12,13,14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm12[0,1],ymm2[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6,7,8],ymm12[9],ymm11[10,11,12,13,14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm12 -; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm14 -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm25 +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm14 +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm24 +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm25 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,4,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,6,4,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4,5,6,7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,3,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm4 & (zmm11 ^ zmm21)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm5 ^ (mem & (zmm22 ^ zmm5)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm4 & (zmm12 ^ zmm22)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm4 & (zmm23 ^ zmm19)) +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm6 ^ (mem & (zmm22 ^ zmm6)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm5 & (zmm12 ^ zmm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm5 & (zmm14 ^ zmm22)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: # zmm23 = zmm23 ^ (zmm5 & (zmm23 ^ mem)) ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa32 %zmm14, %zmm23 {%k1} -; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm4 & (zmm27 ^ zmm20)) +; AVX512-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} +; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm27 ^ (zmm5 & (zmm27 ^ mem)) ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm18 ^ (zmm4 & (zmm1 ^ zmm18)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm20 ^ (zmm5 & (zmm1 ^ zmm20)) ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm17 ^ (zmm4 & (zmm2 ^ zmm17)) -; AVX512-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (zmm5 & (zmm2 ^ zmm19)) +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm16 ^ (zmm4 & (zmm3 ^ zmm16)) -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm16 ^ (zmm5 & (zmm3 ^ zmm16)) +; AVX512-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: 
vmovdqa64 320(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,4,8,11,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermd %zmm9, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14] -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,1,0,2] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermd %zmm21, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,u,u,4,7,11,14] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,0,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23 -; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm11 +; AVX512-FCP-NEXT: vporq %ymm7, %ymm8, %ymm24 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3],xmm15[4],xmm14[5],xmm15[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm1, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm14[3,4,5,6],xmm2[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vporq %ymm4, %ymm0, %ymm20 +; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm21 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm31 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,5,2,5,2,5,2,5] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2],xmm1[3],xmm13[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,u,u,u,4,8,11,15] +; AVX512-FCP-NEXT: vporq %ymm3, %ymm1, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm9, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm21, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,5,8,12,15] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm9, %zmm19, %zmm9 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm3, %ymm2, %ymm18 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm21, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm28[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm13 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm16, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0] -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512-FCP-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpsrld $16, %xmm12, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,7,10,14,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,u,0,3,7,u] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpsrld $16, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm30 & (zmm16 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm4 & (zmm16 ^ zmm0)) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3,4,5],xmm9[6],xmm11[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,6,9,13,0] -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512-FCP-NEXT: vpor %ymm4, %ymm10, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,7,11,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,u,u,6,9,13,u] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,4,7,11,14,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15] +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2],xmm1[3],xmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,0,6,10,13,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,u,6,10,13,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,8,11,15,0,0,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,4,8,11,15,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpermd %zmm30, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; 
AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,10,3,14,7,10,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm3 & (zmm26 ^ zmm23)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm20 ^ (mem & (zmm24 ^ zmm20)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm24)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm30 & (zmm15 ^ zmm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm3 & (zmm26 ^ zmm24)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm5 ^ (mem & (zmm25 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm25)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm4 & (zmm15 ^ zmm31)) ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm30 & (zmm19 ^ zmm18)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm22)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm30 & (zmm11 ^ zmm25)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm21 ^ (zmm4 & (zmm11 ^ zmm21)) ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm10 ^ (zmm30 & (zmm0 ^ zmm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm10 ^ (zmm4 & (zmm0 ^ zmm10)) ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -7416,18 +7411,19 @@ define 
void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7] +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm18 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm7 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] @@ -7444,22 +7440,23 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7474,12 +7471,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -7489,63 +7486,66 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm26 ; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm28 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3] +; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm18 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,1,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512DQ-NEXT: vpsrlq $48, %xmm14, %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlq $48, %xmm14, %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm2[6],xmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm17 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpsrld $16, %xmm13, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm13, 
%xmm27 +; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm29 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm28 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm30 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm11 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] @@ -7563,11 +7563,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] @@ -7583,16 +7583,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~mem) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -7602,63 +7602,63 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm25 & (zmm22 ^ zmm19)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm24 & (zmm21 ^ zmm19)) ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,1,2,0,4,5,6,4] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4],xmm3[5],xmm15[6],xmm3[7] +; AVX512DQ-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm25 & (zmm19 ^ zmm17)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm24 & (zmm19 ^ zmm18)) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm23 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26 +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm22 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm11[0,1,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15] -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3,4,5],xmm12[6],xmm15[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] @@ -7668,399 +7668,403 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm25 & (zmm11 ^ zmm20)) -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm25 ^ (zmm24 & (zmm0 ^ zmm25)) +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7,8,9,10,11],ymm12[12],ymm3[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] 
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7],ymm12[8,9,10,11,12],ymm7[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm10[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5,6,7],ymm10[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm24 ^ (zmm25 & (zmm2 ^ zmm24)) -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,4,6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm23 ^ (zmm24 & (zmm2 ^ zmm23)) +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm25, %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm27 ^ (zmm25 & (zmm3 ^ zmm27)) +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm17 ^ (zmm24 & (zmm3 ^ zmm17)) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm1 & (zmm8 ^ zmm18)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm0 ^ (mem & (zmm21 ^ zmm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm21)) +; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm8 = zmm8 ^ (zmm1 & (zmm8 ^ mem)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm4 ^ (mem & (zmm20 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm20)) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm10 -; 
AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm25[0,1,0,2] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,u,u,u,4,8,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,u,u,u,4,7,11,14] +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm24[0,1,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm12[4],xmm7[5],xmm12[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm7, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm13[2],xmm6[3],xmm13[3] ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3],xmm15[4],xmm14[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm14[3,4,5,6],xmm9[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm2[2],ymm10[3,4,5],ymm2[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3,4,5],xmm0[6],xmm8[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm28 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,5,8,12,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; 
AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm19 & (zmm16 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm20, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3,4,5,6],xmm13[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm20, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [1,u,u,u,5,8,12,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3],xmm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm11)) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm9, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,1,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7],ymm14[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6],xmm8[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm13, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm12 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm19 & (zmm10 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, 
%ymm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7],ymm13[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm12 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,7,10,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm11, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm26 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm14, %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm18 & (zmm2 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,6,9,13,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm12, %zmm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm18, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm13, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,4,7,11,14,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,u,0,3,7,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm30, %xmm10 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,4,7,11,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm20, %zmm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [3,u,u,u,6,10,13,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm13, %zmm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm12 ^ (zmm19 & (zmm11 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm12 ^ (zmm18 & (zmm10 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,4,8,11,15,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermd %zmm25, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm19 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm8 ^ (zmm18 & (zmm0 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm23)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm21 ^ (mem & (zmm24 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm24)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm1 & (zmm11 ^ zmm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm17 ^ (mem & (zmm23 ^ zmm17)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm23)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -8071,54 +8075,54 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 ; AVX512BW-NEXT: movl 
$-524288, %edi # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -8128,60 +8132,60 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w 
%zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] 
+; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) @@ -8193,54 +8197,54 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 
{%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -8250,60 +8254,60 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] ; AVX512BW-FCP-NEXT: # 
zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) @@ -8315,54 +8319,54 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: 
movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -8372,60 +8376,60 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k2} +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w 
%zmm4, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) @@ -8437,54 +8441,54 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -8494,60 +8498,60 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) @@ -11485,7 +11489,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] @@ -11529,7 +11533,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] @@ -11610,7 +11614,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] @@ -12367,7 +12371,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] @@ -12410,7 +12414,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] @@ -13045,7 +13049,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] @@ -13196,306 +13200,295 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: 
vmovdqa %ymm8, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,1,0,4,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,1,u,4,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = 
[2,6,1,0,5,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,1,u,5,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm11 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm7, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FCP-NEXT: 
vmovdqa %ymm14, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[1,3,2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm11[0,1,2],mem[3],ymm11[4,5],mem[6],ymm11[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd $34, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,6,2,5,3,6,2,5] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,2] ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vpblendd $31, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm9 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5],xmm13[6],xmm1[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = 
[0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] @@ -13503,11 +13496,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] @@ -13515,27 +13519,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] +; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -13549,294 +13554,296 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 
= ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm12 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,1,3] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm11 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,3] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm14[0,1,1,3] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,3,7,2,6,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,7,2,6,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm11 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm1[0,1],ymm14[2],ymm1[3,4,5],ymm14[6],ymm1[7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $221, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,7,3,6,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm12 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,4,7,3,6,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm13[1,2,3,4,5,6,7],ymm0[8],ymm13[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; 
AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,3,3,0,3,7,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm13 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3] +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm13 -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2],ymm14[3],mem[4,5],ymm14[6],mem[7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm11, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -13845,59 +13852,60 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0] +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,4,0,3,7,0,0,0] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,4,0,3,7,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = 
[28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload @@ -13905,92 +13913,93 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm10 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; 
AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -14013,13 +14022,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512-NEXT: vpbroadcastw 700(%rdi), %xmm4 ; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa64 672(%rdi), %xmm22 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm22[0,1,0,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,1,0,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 @@ -14228,10 +14237,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512-NEXT: vpbroadcastw 680(%rdi), %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX512-NEXT: vpbroadcastw 680(%rdi), %xmm5 -; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] @@ -14396,7 +14405,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] ; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] ; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12 @@ -14805,7 +14814,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm8 ; AVX512-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7] -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload ; AVX512-NEXT: # zmm10 = mem ^ (zmm9 & (zmm10 ^ mem)) @@ -14893,12 +14902,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX512-FCP-NEXT: subq $1736, %rsp # imm = 0x6C8 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,5,9,u,12,u,u,u] ; AVX512-FCP-NEXT: vpermd %zmm26, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 @@ -14912,16 +14921,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm17 +; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm16 ; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vmovdqa64 
%xmm7, %xmm20 ; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2] -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm24[0,1,0,2] +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 @@ -14929,452 +14938,458 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm17 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm3 +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; 
AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm25 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm22 +; AVX512-FCP-NEXT: 
vpshufb %xmm3, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm5[7] ; AVX512-FCP-NEXT: vmovdqa 688(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,6,9,u,13,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm12 ; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpermd %zmm26, %zmm5, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] +; 
AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm18, %ymm12 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20 +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm18, %ymm13 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm10 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm18, %ymm7 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 +; 
AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm11 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm18, %ymm9 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] -; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] +; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpor %ymm11, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7] -; AVX512-FCP-NEXT: 
vextracti128 $1, %ymm9, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm22[0,1,1,3] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10 ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm10 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm20, %xmm9 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm27 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13] -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13] +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm26, %zmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 
%xmm18, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm7 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm15 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm9 ; AVX512-FCP-NEXT: vpsrld $16, %xmm19, %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa %xmm10, %xmm3 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14] -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 
16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm31 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,u,u,u,4,7,11,14] +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3,4,5,6],xmm10[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm26, %zmm6 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 {%k1} # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX512-FCP-NEXT: vpsrld $16, %xmm14, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] +; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm31, %ymm12, %ymm0 +; AVX512-FCP-NEXT: vpermd %ymm24, %ymm13, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,6,9,13,2,6,9,13] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = 
[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm6[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4],xmm10[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm9 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3,4,5,6],xmm9[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm10 ; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512-FCP-NEXT: 
vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm11, %zmm13 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm1[4],xmm9[5],xmm1[6],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,u,u,u,4,8,11,15] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm17, %zmm13 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4,5,6],xmm11[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %zmm10, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5],xmm1[6],xmm11[7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7] -; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm16 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm10, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpermd %zmm10, %zmm26, %zmm9 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5],xmm9[6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermd %zmm21, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, 
%xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7] +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4,5],ymm1[6],ymm6[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm23 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,3,7,10,14,u,u,u] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermd %zmm12, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm26 & (zmm25 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm19 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm23 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,0,0,0,6,9,13,0] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm20, %zmm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,u,u,6,9,13,u] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm11, %zmm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] @@ -15383,245 +15398,245 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm10[2],ymm4[3,4],ymm10[5],ymm4[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm26 & (zmm27 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm25 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm27 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,4,7,11,14,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm12, %zmm17, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,u,u,u,6,10,13,u] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2,3],ymm4[4,5],ymm10[6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm10 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, 
%xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm21, %zmm7 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm24, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,8,11,15,u,u,u] +; AVX512-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm7 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512-FCP-NEXT: vpermd %zmm18, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1,2],ymm13[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm4 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermd %zmm20, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3],xmm9[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm10 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermd %zmm28, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm11 = mem ^ (zmm8 & (zmm11 ^ mem)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm13 = mem ^ (zmm9 & (zmm13 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm17 = zmm17 ^ (zmm9 & (zmm17 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm21 = zmm21 ^ (zmm9 & (zmm21 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm12)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm13)) -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm13 = mem ^ (zmm8 & (zmm13 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = zmm16 ^ (zmm8 & (zmm16 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm31 = zmm31 ^ (zmm8 & (zmm31 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm8 & (zmm10 ^ zmm13)) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7],ymm7[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm29 & (zmm19 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm26 & (zmm19 ^ mem)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = zmm6 ^ (zmm29 & (zmm6 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = zmm8 ^ (zmm26 & (zmm8 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm26 & (zmm2 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm26 & (zmm1 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rdx) ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm26 & (zmm1 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm29 & (zmm4 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm26 & (zmm4 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm29 & (zmm0 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm26 & (zmm0 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX512-FCP-NEXT: addq $1736, %rsp # imm = 0x6C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -15667,8 +15682,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2] ; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2] ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm15 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] @@ -15845,11 +15860,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm6 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm7 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-NEXT: 
vinserti32x4 $2, %xmm6, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] @@ -16027,7 +16042,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm12, %ymm12 @@ -16458,7 +16473,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm0 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm1 = mem ^ (zmm0 & (zmm1 ^ mem)) @@ -16501,9 +16516,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,5,9,u,12,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm12 @@ -16559,7 +16574,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm5[1],xmm13[2,3,4,5,6,7] @@ -16596,7 +16611,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,9,0,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,9,u,13,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 @@ -16667,7 +16682,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 @@ -16681,11 +16696,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,1,1,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm23 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] @@ -16714,7 +16729,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm4 @@ -16725,7 +16740,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] ; AVX512DQ-FCP-NEXT: vpermd 
%zmm21, %zmm27, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 @@ -16746,12 +16761,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm30, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 @@ -16768,10 +16783,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,4,7,11,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,u,u,u,4,7,11,14] ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm14 @@ -16799,7 +16814,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] @@ -16811,7 +16826,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16834,16 +16849,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,u,u,u,4,8,11,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16867,7 +16882,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 @@ -16922,7 +16937,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 @@ -16967,7 +16982,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,7,10,14,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] @@ -16986,7 +17001,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,u,u,u,6,9,13,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 @@ -17038,7 +17053,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,4,7,11,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm22, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 @@ -17055,7 +17070,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,u,u,u,6,10,13,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -17104,7 +17119,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,4,8,11,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 @@ -17120,7 +17135,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = 
[0,1,10,3,14,7,10,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 @@ -17165,7 +17180,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm1 = mem ^ (zmm25 & (zmm1 ^ mem)) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm2 & (zmm3 ^ mem)) @@ -17225,36 +17240,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm17 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm9 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm18 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-NEXT: 
vpermt2w %zmm15, %zmm18, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17264,25 +17279,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17308,7 +17323,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17318,19 +17333,19 @@ define void 
@load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17352,7 +17367,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17362,21 +17377,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm25, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; 
AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm25, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0 @@ -17421,36 +17436,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm17 ; AVX512BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm11, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17460,25 +17475,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm17 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17504,7 +17519,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17514,19 +17529,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm25 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = 
[0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17548,7 +17563,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17558,21 +17573,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm27 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm25, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm25, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm0 @@ -17617,36 +17632,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; 
AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm17 ; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] +; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm18 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17656,25 +17671,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 
{%k1} -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17700,7 +17715,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17710,19 +17725,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] ; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512DQ-BW-NEXT: 
vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17744,7 +17759,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17754,21 +17769,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm25, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm25, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0 @@ -17813,36 +17828,36 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = 
mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42] @@ -17852,25 +17867,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm21 
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00 @@ -17896,7 +17911,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1} @@ -17906,19 +17921,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -17940,7 +17955,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -17950,21 +17965,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm25, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm25, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index fff21f9aad1bb..a5346f920f6fc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -45,228 +45,228 @@ define void @load_i16_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i16_stride8_vf2: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vmovd %xmm2, (%rsi) ; AVX-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vmovd %xmm0, (%r9) -; AVX-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride8_vf2: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vmovd %xmm2, (%rsi) ; AVX2-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX2-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX2-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vmovd %xmm0, (%r9) -; AVX2-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX2-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX2-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX2-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i16_stride8_vf2: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FP-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX2-FP-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX2-FP-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vmovd %xmm0, (%r9) -; AVX2-FP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX2-FP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX2-FP-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX2-FP-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX2-FP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i16_stride8_vf2: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FCP-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX2-FCP-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), 
%rdx +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vmovd %xmm0, (%r9) -; AVX2-FCP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX2-FCP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX2-FCP-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX2-FCP-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX2-FCP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride8_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vmovd %xmm2, (%rsi) ; AVX512-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX512-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX512-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vmovd %xmm0, (%r9) -; AVX512-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX512-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX512-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride8_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512-FCP-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX512-FCP-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vmovd %xmm0, (%r9) -; AVX512-FCP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512-FCP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512-FCP-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX512-FCP-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX512-FCP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride8_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-NEXT: vpextrd $1, %xmm2, (%rdx) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrd $2, %xmm2, (%rcx) +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpextrd $3, %xmm2, (%r8) ; AVX512DQ-NEXT: vmovd %xmm0, (%r9) -; AVX512DQ-NEXT: vpextrd $1, 
%xmm0, (%r11) -; AVX512DQ-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512DQ-NEXT: vpextrd $1, %xmm0, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpextrd $2, %xmm0, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm2, (%r8) ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r9) -; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm0, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride8_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX512BW-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX512BW-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-NEXT: vmovd %xmm0, (%r9) -; AVX512BW-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512BW-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512BW-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX512BW-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX512BW-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride8_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx) ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; 
AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r9) -; AVX512BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512BW-FCP-NEXT: vpextrd $1, %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride8_vf2: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vpextrd $1, %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm2, (%r8) ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r9) -; AVX512DQ-BW-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512DQ-BW-NEXT: vpextrd $1, %xmm0, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r9) -; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11) -; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10) +; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i16>, ptr %in.vec, align 64 @@ -496,7 +496,7 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [3,7,3,3] ; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 ; AVX512-NEXT: vpunpckhwd {{.*#+}} 
xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -527,11 +527,11 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,5,1,1] ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,7,3,3] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -564,7 +564,7 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [3,7,3,3] ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -595,11 +595,11 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,5,1,1] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,7,3,3] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1124,7 +1124,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = 
xmm6[0,1,2],xmm8[3] @@ -1143,7 +1143,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1186,7 +1186,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7 @@ -1198,18 +1198,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1250,7 +1250,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -1269,7 +1269,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] ; AVX512DQ-NEXT: 
vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1312,7 +1312,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7 @@ -1324,18 +1324,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1373,23 +1373,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1407,23 +1407,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1441,23 +1441,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw 
{{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -1475,23 +1475,23 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -2487,7 +2487,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] @@ -2552,7 +2552,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2655,7 +2655,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm13 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 @@ -2689,7 +2689,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -2705,7 +2705,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -2722,7 +2722,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2826,7 +2826,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] @@ -2891,7 +2891,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -2994,7 +2994,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 @@ -3028,7 +3028,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -3044,7 +3044,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -3061,7 +3061,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -3162,52 +3162,44 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = 
[1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -3230,52 +3222,44 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; 
AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -3298,52 +3282,44 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, 
%zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -3366,52 +3342,44 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = 
[1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -5729,7 +5697,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] @@ -5891,7 +5859,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm6 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm11, %xmm6, %xmm7 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] @@ -5929,7 +5897,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload @@ -6061,7 +6029,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] ; AVX512-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -6118,7 +6086,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -6215,7 +6183,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm11 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm16[0],xmm22[0],xmm16[1],xmm22[1] @@ -6245,7 +6213,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -6281,7 +6249,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -6318,7 +6286,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,4] ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -6384,7 +6352,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -6413,7 +6381,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -6500,7 +6468,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm3[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] @@ -6662,7 +6630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm6 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm6, %xmm7 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] @@ -6700,7 +6668,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload @@ -6832,7 +6800,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -6889,7 +6857,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -6986,7 +6954,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm11 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] @@ -7016,7 +6984,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] 
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -7052,7 +7020,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -7089,7 +7057,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -7155,7 +7123,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -7184,7 +7152,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -7284,9 +7252,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -7296,9 +7264,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -7308,9 +7276,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -7320,9 +7288,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -7332,9 +7300,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -7344,9 +7312,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -7356,9 +7324,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -7403,9 +7371,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -7415,9 +7383,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -7427,9 +7395,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -7439,9 +7407,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -7451,9 +7419,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -7463,9 +7431,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -7475,9 +7443,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -7522,9 +7490,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -7534,9 +7502,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -7546,9 +7514,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -7558,9 +7526,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -7570,9 +7538,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -7582,9 +7550,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -7594,9 +7562,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -7641,9 +7609,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -7653,9 +7621,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -7665,9 +7633,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -7677,9 +7645,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm14 @@ -7689,9 +7657,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -7701,9 +7669,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -7713,9 +7681,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -12657,7 +12625,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,4] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -13045,7 +13013,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 @@ 
-13118,7 +13086,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,0,4] ; AVX512-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -13459,7 +13427,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -13566,7 +13534,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,0,4] ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -13797,7 +13765,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm13 = [1,5,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -13869,7 +13837,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm28 @@ -13970,7 +13938,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 
16-byte Folded Reload @@ -14042,7 +14010,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,0,4] ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -14221,7 +14189,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -14288,7 +14256,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,6] ; AVX512-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -14386,7 +14354,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -14492,7 +14460,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,4] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -14880,7 +14848,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; 
AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -14953,7 +14921,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,0,4] ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -15294,7 +15262,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512DQ-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -15401,7 +15369,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -15632,7 +15600,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm13 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -15704,7 +15672,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm28 @@ -15805,7 +15773,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0] +; 
AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload @@ -15877,7 +15845,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,0,4] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -16056,7 +16024,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -16123,7 +16091,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,6] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -16221,7 +16189,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -16428,29 +16396,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: 
vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -16647,29 +16615,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: 
vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -16866,29 +16834,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -17085,29 +17053,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index f2c5a91d2cca3..17ca6c10f3972 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -194,81 +194,81 @@ define void @load_i32_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-LABEL: load_i32_stride2_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512-NEXT: vmovaps %xmm1, (%rdx) +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512-NEXT: vmovaps %xmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i32_stride2_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512-FCP-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512-FCP-NEXT: vmovaps %xmm1, (%rdx) +; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512-FCP-NEXT: vmovaps %xmm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride2_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm1 -; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512DQ-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512DQ-NEXT: vmovaps %xmm1, (%rdx) +; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512DQ-NEXT: vmovaps %xmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512DQ-FCP-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride2_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovaps (%rdi), %xmm1 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512BW-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512BW-NEXT: vmovaps %xmm1, (%rdx) +; AVX512BW-NEXT: vmovaps 
(%rdi), %xmm0 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512BW-NEXT: vmovaps %xmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i32_stride2_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512BW-FCP-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride2_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512DQ-BW-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i32>, ptr %in.vec, align 64 @@ -362,11 +362,11 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i32_stride2_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-FCP-NEXT: vpmovqd %zmm1, (%rsi) +; AVX512-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -383,11 +383,11 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqd %zmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rdx) ; 
AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -404,11 +404,11 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpmovqd %zmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -425,11 +425,11 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride2_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %zmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 @@ -479,14 +479,14 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] ; AVX-NEXT: vmovaps %ymm5, (%rsi) -; AVX-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX-NEXT: vzeroupper @@ -559,9 +559,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -572,9 +572,9 @@ define void 
@load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -585,9 +585,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -598,9 +598,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -611,9 +611,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -624,9 +624,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -637,9 +637,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -650,9 +650,9 @@ define void @load_i32_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -737,24 +737,24 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm6[0,2],ymm1[4,6],ymm6[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm5[0,2],ymm1[4,6],ymm5[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,2],ymm8[0,2],ymm0[4,6],ymm8[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,2],ymm8[0,2],ymm0[4,6],ymm8[4,6] ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm10[0,2],ymm2[4,6],ymm10[4,6] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm6[1,3],ymm1[5,7],ymm6[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm9[0,2],ymm2[4,6],ymm9[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm5[1,3],ymm1[5,7],ymm5[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm8[1,3],ymm0[5,7],ymm8[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm10[1,3],ymm2[5,7],ymm10[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[1,3],ymm9[1,3],ymm2[5,7],ymm9[5,7] ; AVX-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX-NEXT: vmovaps %ymm9, (%rsi) +; AVX-NEXT: vmovaps %ymm10, (%rsi) ; AVX-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX-NEXT: vmovaps %ymm6, 96(%rsi) ; AVX-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm3, 96(%rdx) @@ -879,11 +879,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -899,11 +899,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -919,11 +919,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -939,11 +939,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -959,11 +959,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -979,11 +979,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -999,11 +999,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1019,11 +1019,11 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, 
%zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1195,64 +1195,64 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; ; AVX-LABEL: load_i32_stride2_vf64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] +; AVX-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm12 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[0,2],ymm10[4,6],ymm9[4,6] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6] -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6] -; AVX-NEXT: 
vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX-NEXT: vmovaps %ymm0, (%rsi) -; AVX-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm6, %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] +; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm8, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,3],ymm9[1,3],ymm10[5,7],ymm9[5,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm11[1,3],ymm12[5,7],ymm11[5,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,3],ymm13[1,3],ymm14[5,7],ymm13[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm5[0,2],ymm7[4,6],ymm5[4,6] +; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,3],ymm5[1,3],ymm7[5,7],ymm5[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm11[0,2],ymm2[4,6],ymm11[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm11[1,3],ymm2[5,7],ymm11[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,2],ymm9[0,2],ymm4[4,6],ymm9[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm9[1,3],ymm4[5,7],ymm9[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2],ymm15[0,2],ymm8[4,6],ymm15[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3],ymm15[1,3],ymm8[5,7],ymm15[5,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm14[0,2],ymm0[4,6],ymm14[4,6] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm14[1,3],ymm0[5,7],ymm14[5,7] +; AVX-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX-NEXT: vmovaps %ymm11, 128(%rsi) +; AVX-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX-NEXT: vmovaps %ymm15, (%rsi) +; AVX-NEXT: vmovaps %ymm1, 160(%rsi) ; AVX-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX-NEXT: vmovaps %ymm1, (%rdx) -; AVX-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX-NEXT: vmovaps %ymm13, 224(%rsi) +; AVX-NEXT: vmovaps %ymm0, (%rdx) +; AVX-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX-NEXT: vmovaps %ymm12, 160(%rdx) +; AVX-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1479,7 +1479,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vpmovsxbd 
{{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1487,7 +1487,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1513,7 +1513,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1521,7 +1521,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1547,7 +1547,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1555,7 +1555,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1581,7 +1581,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, 
%zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1589,7 +1589,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1615,7 +1615,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1623,7 +1623,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1649,7 +1649,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1657,7 +1657,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1683,7 +1683,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1691,7 +1691,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 @@ -1717,7 +1717,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1725,7 +1725,7 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 34f23213500c1..3d655b1ba3e62 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -104,7 +104,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride3_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,6,7] ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 ; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] @@ -131,7 +131,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,6,7] ; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] @@ -158,7 +158,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride3_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,6,7] ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = 
xmm2[0],mem[1],xmm2[2,3] @@ -185,7 +185,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcastss 8(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] @@ -305,12 +305,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride3_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovaps %xmm0, (%rsi) ; AVX512-NEXT: vmovaps %xmm2, (%rdx) @@ -320,12 +320,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride3_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -335,12 +335,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride3_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) @@ -350,12 +350,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) ; 
AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -365,12 +365,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) @@ -380,12 +380,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -395,12 +395,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) @@ -410,12 +410,12 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,3,6,9] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,4,7,10] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,5,8,11] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -579,11 +579,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -595,11 +595,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -611,11 +611,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -627,11 +627,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -643,11 +643,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -659,11 +659,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -675,11 +675,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -691,11 +691,11 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -993,17 +993,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; 
AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1016,17 +1016,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1039,17 +1039,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1062,17 +1062,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1085,17 +1085,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1108,17 +1108,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1131,17 +1131,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1154,17 +1154,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1817,23 +1817,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -1854,23 +1854,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -1891,23 +1891,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm4, 
%zmm10, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -1928,23 +1928,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -1965,23 +1965,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2002,23 +2002,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2039,23 +2039,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -2076,23 +2076,23 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 @@ -3581,10 +3581,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 
64(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3594,10 +3594,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3607,9 +3607,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3646,10 +3646,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3659,10 +3659,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3672,9 +3672,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3711,10 +3711,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3724,10 +3724,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3737,9 +3737,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3776,10 +3776,10 @@ define void 
@load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3789,10 +3789,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3802,9 +3802,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3841,10 +3841,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3854,10 +3854,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, 
%zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3867,9 +3867,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3906,10 +3906,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3919,10 +3919,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3932,9 +3932,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, 
%zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -3971,10 +3971,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -3984,10 +3984,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -3997,9 +3997,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 @@ -4036,10 +4036,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 @@ -4049,10 +4049,10 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 @@ -4062,9 +4062,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 0bf1260738439..fed360c89f2ec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1] ; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -134,12 +134,12 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] -; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,1,1] +; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovlps 
%xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -164,7 +164,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,5,1,1] ; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) @@ -192,12 +192,12 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] -; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,1,1] +; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -364,14 +364,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride4_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovaps %xmm0, (%rsi) ; AVX512-NEXT: vmovaps %xmm2, (%rdx) @@ -382,14 +382,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride4_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) ; 
AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -400,14 +400,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride4_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512DQ-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) @@ -418,14 +418,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -436,14 +436,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride4_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) @@ -454,14 +454,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} xmm4 = [3,7,11,15] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -472,14 +472,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) @@ -490,14 +490,14 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) @@ -610,20 +610,20 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-LABEL: load_i32_stride4_vf8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vpermps %ymm1, %ymm4, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm5 +; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5] ; AVX2-NEXT: vpermps %ymm2, %ymm5, %ymm9 ; AVX2-NEXT: vpermps %ymm1, %ymm5, %ymm10 @@ 
-637,11 +637,11 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm11 ; AVX2-NEXT: vpermps %ymm1, %ymm9, %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -649,9 +649,9 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-NEXT: vmovaps %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -659,20 +659,20 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-LABEL: load_i32_stride4_vf8: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm4, %ymm5 -; AVX2-FP-NEXT: vpermps %ymm1, %ymm4, %ymm6 +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpermps %ymm1, %ymm3, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5] ; AVX2-FP-NEXT: vpermps %ymm2, %ymm5, %ymm9 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm5, %ymm10 @@ -686,11 +686,11 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm9, %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 
= ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -698,9 +698,9 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -708,20 +708,20 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i32_stride4_vf8: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [1,5,1,5,1,5,1,5] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm9 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm10 @@ -735,11 +735,11 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3,7,3,7,3,7,3,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -747,24 +747,24 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FCP-NEXT: 
vmovaps %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i32_stride4_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -775,15 +775,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i32_stride4_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -794,15 +794,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride4_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -813,15 +813,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i32_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -832,15 +832,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride4_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -851,15 +851,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i32_stride4_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -870,15 +870,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride4_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -889,15 +889,15 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i32_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1156,17 +1156,17 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $104, %rsp -; AVX2-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] @@ -1178,66 +1178,66 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] -; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm0 +; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 176(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm8 -; AVX2-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,6,2,6,2,6,2,6] +; AVX2-NEXT: vpermps %ymm2, %ymm1, %ymm8 +; AVX2-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm0 +; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: vunpckhps {{.*#+}} xmm10 = 
xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7] -; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm5 -; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpermps %ymm7, %ymm1, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-NEXT: vpermps %ymm15, %ymm9, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1250,7 +1250,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovaps %ymm4, (%r8) +; AVX2-NEXT: vmovaps %ymm3, (%r8) ; AVX2-NEXT: addq $104, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1258,17 +1258,17 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride4_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $104, %rsp -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] @@ -1280,66 +1280,66 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm9, %ymm0 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm9, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 176(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-FP-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = 
ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,6,2,6,2,6,2,6] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm1, %ymm8 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm9, %ymm5 -; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FP-NEXT: vpermps %ymm7, %ymm9, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm1, %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm9, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1352,7 +1352,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) ; AVX2-FP-NEXT: addq $104, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1360,17 +1360,17 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride4_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $104, %rsp -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 -; 
AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] @@ -1382,66 +1382,66 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm13 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 176(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm0 
+; AVX2-FCP-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,6,2,6,2,6,2,6] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm5 -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm1, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: 
vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1454,7 +1454,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) ; AVX2-FCP-NEXT: addq $104, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1470,23 +1470,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1506,23 +1506,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; 
AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1542,23 +1542,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1578,23 +1578,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1614,23 +1614,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1650,23 +1650,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1686,23 +1686,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1722,23 +1722,23 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -2306,8 +2306,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm11 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vmovaps 
352(%rdi), %ymm9 ; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm1 ; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2458,63 +2458,63 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload ; AVX2-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm11 -; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm11 = [3,7,3,7,3,7,3,7] +; AVX2-NEXT: vpermps %ymm15, %ymm11, %ymm12 +; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpermps %ymm9, %ymm0, %ymm3 -; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpermps %ymm9, %ymm11, %ymm3 +; AVX2-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm6 -; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm8 +; AVX2-NEXT: vpermps %ymm8, %ymm11, %ymm6 +; AVX2-NEXT: vpermps %ymm4, %ymm11, %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-NEXT: vmovaps %ymm3, 64(%r8) ; AVX2-NEXT: vmovaps %ymm2, (%r8) @@ -2532,8 +2532,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2684,63 +2684,63 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 
16-byte Folded Reload ; AVX2-FP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [3,7,3,7,3,7,3,7] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm11, %ymm12 +; AVX2-FP-NEXT: vpermps %ymm14, %ymm11, %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm9, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm9, %ymm11, %ymm3 +; AVX2-FP-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpermps %ymm8, %ymm11, %ymm6 +; AVX2-FP-NEXT: vpermps %ymm4, %ymm11, %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8) ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8) @@ -2758,8 +2758,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2910,63 +2910,63 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 
# 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [3,7,3,7,3,7,3,7] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm11, %ymm12 +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm11, %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm11, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm11, %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps 
%ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) @@ -2983,21 +2983,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512-NEXT: # zmm7 = 
mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3008,7 +3008,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3019,14 +3019,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3045,21 +3045,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; 
AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3070,7 +3070,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3081,14 +3081,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3107,21 +3107,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm8, 
%zmm10 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3132,7 +3132,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3143,14 +3143,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3169,21 +3169,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3194,7 +3194,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3205,14 +3205,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3231,21 +3231,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3256,7 +3256,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3267,14 +3267,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3293,21 +3293,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3318,7 +3318,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3329,14 +3329,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3355,21 +3355,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3380,7 +3380,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3391,14 +3391,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -3417,21 +3417,21 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 @@ -3442,7 +3442,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 @@ -3453,14 +3453,14 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -4645,8 +4645,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5134,8 +5134,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5623,8 +5623,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), 
%ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6110,28 +6110,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6142,17 +6142,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; 
AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6163,17 +6163,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6185,21 +6185,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6211,7 +6211,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -6224,28 +6224,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-FCP-NEXT: 
vmovdqa64 960(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6256,17 +6256,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6277,17 +6277,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = 
zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6299,21 +6299,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6325,7 +6325,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -6338,28 +6338,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-NEXT: 
vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6370,17 +6370,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6391,17 +6391,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; 
AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6413,21 +6413,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6439,7 +6439,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6452,28 +6452,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6484,17 +6484,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6505,17 +6505,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = 
[2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6527,21 +6527,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6553,7 +6553,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -6566,28 +6566,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 
768(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6598,17 +6598,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6619,17 +6619,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = 
[2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6641,21 +6641,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6667,7 +6667,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -6680,28 +6680,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; 
AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6712,17 +6712,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6733,17 +6733,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6755,21 +6755,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6781,7 +6781,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -6794,28 +6794,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 
; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6826,17 +6826,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm23 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6847,17 +6847,17 @@ define 
void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6869,21 +6869,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -6895,7 +6895,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -6908,28 +6908,28 @@ define void 
@load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm19, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 @@ -6940,17 +6940,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, 
%zmm23 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 @@ -6961,17 +6961,17 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm24, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 @@ -6983,21 +6983,21 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 
128(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -7009,7 +7009,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index c08442f9d9d01..f12a7fd5b3be9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -49,11 +49,11 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm3, (%rsi) ; AVX-NEXT: vmovq %xmm4, (%rdx) ; AVX-NEXT: vpextrq $1, %xmm5, (%rcx) +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: vmovq %xmm1, (%r9) ; AVX-NEXT: retq @@ -67,14 +67,14 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-NEXT: vmovq %xmm3, (%rsi) ; AVX2-NEXT: vmovq %xmm4, (%rdx) ; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-NEXT: vmovq %xmm0, (%r8) -; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vmovq %xmm1, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -87,14 +87,14 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) ; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) ; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-FP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vmovq %xmm1, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -107,14 +107,14 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) ; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -122,20 +122,20 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) -; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) ; AVX512-NEXT: vmovq %xmm2, (%r9) ; AVX512-NEXT: vzeroupper @@ -146,10 +146,10 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [1,6,0,0] ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [2,7,0,0] ; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 @@ -166,20 +166,20 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; 
AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -190,10 +190,10 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [1,6,0,0] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [2,7,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 @@ -210,20 +210,20 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) ; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vzeroupper @@ -234,10 +234,10 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [1,6,0,0] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd 
{{.*#+}} xmm5 = [2,7,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [2,7,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 @@ -254,20 +254,20 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper @@ -278,10 +278,10 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [1,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [2,7,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 @@ -356,30 +356,30 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps (%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vmovaps (%rdi), %xmm4 +; AVX-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm5[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX-NEXT: vmovaps (%rdi), %xmm3 -; AVX-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,2,3,3] ; AVX-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0] +; AVX-NEXT: vinsertps 
{{.*#+}} xmm3 = xmm3[0,1,2],xmm6[0] ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0] -; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2] +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],mem[1,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm7[0,1,2],xmm6[1] +; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3] ; AVX-NEXT: vmovaps %xmm2, (%rsi) -; AVX-NEXT: vmovaps %xmm5, (%rdx) -; AVX-NEXT: vmovaps %xmm7, (%rcx) -; AVX-NEXT: vmovaps %xmm3, (%r8) +; AVX-NEXT: vmovaps %xmm3, (%rdx) +; AVX-NEXT: vmovaps %xmm5, (%rcx) +; AVX-NEXT: vmovaps %xmm4, (%r8) ; AVX-NEXT: vmovaps %xmm0, (%r9) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -388,16 +388,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -406,7 +406,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -422,16 +422,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; 
AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -440,7 +440,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -456,16 +456,16 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,2,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,3,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -474,7 +474,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,1,6,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -490,15 +490,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; 
AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -512,15 +512,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -534,15 +534,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -556,15 +556,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; 
AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -578,15 +578,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -600,15 +600,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -622,15 +622,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -644,15 +644,15 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,11,16] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,7,12,17] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,8,13,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,9,14,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -768,21 +768,21 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX: # %bb.0: ; AVX-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX-NEXT: vmovaps (%rdi), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3] -; AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm5, %ymm7 +; AVX-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX-NEXT: vmovaps (%rdi), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3] +; AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm4, %ymm7 +; AVX-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4],ymm9[5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4] ; AVX-NEXT: vmovaps (%rdi), %xmm9 @@ -795,7 +795,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[2,0],ymm8[7,4],ymm1[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,1],ymm8[6,4],ymm1[6,5] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] ; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm11 
= xmm11[1,0] @@ -804,7 +804,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4] ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6] ; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3] @@ -813,14 +813,14 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps %ymm5, (%rsi) +; AVX-NEXT: vmovaps %ymm4, (%rsi) ; AVX-NEXT: vmovaps %ymm7, (%rdx) ; AVX-NEXT: vmovaps %ymm8, (%rcx) ; AVX-NEXT: vmovaps %ymm9, (%r8) @@ -833,28 +833,28 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6],ymm6[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,3,0,5,2,7,u] ; AVX2-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = 
[2,7,4,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm9 ; AVX2-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm9[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,5,0,5,0,5,0,5] @@ -862,22 +862,22 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6] ; AVX2-NEXT: vpermd %ymm0, %ymm8, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,2,7,4,u,u] ; AVX2-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[0,1],ymm5[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,7,2,7,2,7,2,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-NEXT: vmovdqa %ymm3, (%rsi) ; AVX2-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-NEXT: vmovdqa %ymm8, (%r8) @@ -890,28 +890,28 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6],ymm6[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,3,0,5,2,7,u] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm9 ; AVX2-FP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,5,0,5,0,5,0,5] @@ -919,22 +919,22 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm8, %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,2,7,4,u,u] ; AVX2-FP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[0,1],ymm5[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,7,2,7,2,7,2,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm8, (%r8) @@ -947,28 +947,28 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rdi), 
%ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,5,2,7] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,6,3,0,5,2,7,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,3,0,5,2,7,u] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,7,4,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm9 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,5,0,5,0,5,0,5] @@ -976,22 +976,22 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1,6,1,6,1,6,1,6] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,6,2,7,4,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,2,7,4,u,u] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,0] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = 
ymm4[0,1],ymm5[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,7,2,7,2,7,2,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r8) @@ -1003,26 +1003,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -1036,26 +1036,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512-FCP-NEXT: vpbroadcastd 
144(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1069,26 +1069,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512DQ-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -1102,26 +1102,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512DQ-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1135,26 +1135,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; 
AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1168,26 +1168,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1201,26 +1201,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -1234,26 +1234,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [17,22,27,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -1513,15 +1513,15 @@ define void 
@load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX: # %bb.0: ; AVX-NEXT: subq $136, %rsp ; AVX-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX-NEXT: vmovaps 256(%rdi), %ymm8 ; AVX-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX-NEXT: vmovaps 160(%rdi), %ymm13 +; AVX-NEXT: vmovaps 160(%rdi), %ymm14 ; AVX-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX-NEXT: vmovaps (%rdi), %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm3 @@ -1533,39 +1533,39 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm11[2,3],ymm14[4,5],ymm11[6,7] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm4 ; AVX-NEXT: vmovaps 256(%rdi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4],ymm8[5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm5[1,3],ymm4[6,5],ymm5[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] ; AVX-NEXT: vmovaps (%rdi), %xmm15 ; AVX-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[3,0],ymm1[6,4],ymm7[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX-NEXT: vbroadcastss 144(%rdi), %ymm7 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 144(%rdi), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4] ; AVX-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3] +; AVX-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm7[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] @@ -1574,45 +1574,45 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm2[2,0],ymm3[7,4],ymm2[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,1],ymm0[6,4],ymm2[6,5] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7] ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] +; 
AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm5[2,2],ymm14[6,4],ymm5[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,0],ymm2[3,0],ymm8[4,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6] -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],mem[1,3] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload @@ -1623,7 +1623,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5],ymm2[6,7] ; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload ; AVX-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -1639,10 +1639,10 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm2, (%rdx) -; AVX-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX-NEXT: vmovaps %ymm13, 32(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm2, (%rcx) -; AVX-NEXT: vmovaps %ymm8, 32(%r8) +; AVX-NEXT: vmovaps %ymm7, 32(%r8) ; AVX-NEXT: vmovaps %ymm10, (%r8) ; AVX-NEXT: vmovaps %ymm0, 32(%r9) ; AVX-NEXT: vmovaps %ymm1, (%r9) @@ -1652,355 +1652,346 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i32_stride5_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-NEXT: subq $40, %rsp +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-NEXT: 
vmovdqa 224(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vpermd %ymm6, %ymm7, %ymm10 +; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] +; AVX2-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] -; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0] -; AVX2-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,3,0,5,2,7,u] +; AVX2-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] +; AVX2-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 +; AVX2-NEXT: vpermd %ymm13, %ymm7, %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] +; AVX2-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-NEXT: vpermd %ymm15, %ymm7, %ymm7 ; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 -; AVX2-NEXT: vmovdqa %ymm7, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; 
AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] -; AVX2-NEXT: vpermd %ymm12, %ymm0, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm7 -; AVX2-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] -; AVX2-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-NEXT: vpermd %ymm12, %ymm7, %ymm8 +; AVX2-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,2,7,4,u,u] +; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-NEXT: vpermd %ymm12, %ymm10, %ymm11 +; AVX2-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] +; AVX2-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm13, %ymm9, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm15, %ymm7, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rsi) -; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm6[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: addq $40, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride5_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $72, %rsp -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FP-NEXT: subq $40, %rsp +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpermd %ymm6, %ymm7, %ymm10 +; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] +; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] -; AVX2-FP-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0] -; AVX2-FP-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-FP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,3,0,5,2,7,u] +; AVX2-FP-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] +; AVX2-FP-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm7, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] +; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpermd %ymm15, %ymm7, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-FP-NEXT: vpermd %ymm10, %ymm7, %ymm7 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] -; AVX2-FP-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm7 
= [2,7,2,7,2,7,2,7] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm7, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,2,7,4,u,u] +; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm10, %ymm11 +; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FP-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm13, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd %ymm15, %ymm7, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm6[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) +; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-FP-NEXT: addq $72, %rsp +; AVX2-FP-NEXT: addq $40, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $72, %rsp -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FCP-NEXT: subq $40, %rsp +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4],ymm12[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm10 +; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] -; AVX2-FCP-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,6,3,0,5,2,7,0] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,3,0,5,2,7,u] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 +; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm7, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] +; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,6,2,7,4,0,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[0,1],ymm9[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,0] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,2,7,4,u,u] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm11 +; 
AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,1,6,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm2[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm6[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-FCP-NEXT: addq $72, %rsp +; AVX2-FCP-NEXT: addq $40, %rsp ; 
AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -2014,52 +2005,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; 
AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper @@ -2075,52 +2066,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -2136,52 +2127,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -2197,52 +2188,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -2258,52 +2249,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-NEXT: vzeroupper @@ -2319,52 +2310,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, 
%zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -2380,52 +2371,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2441,52 +2432,52 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [17,22,27,0,5,10,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,7,12,17,22,27,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [17,22,27,0,5,10,15,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,17,22,27,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,8,13,18,23,28,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; 
AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -3001,77 +2992,77 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i32_stride5_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $952, %rsp # imm = 0x3B8 +; AVX-NEXT: subq $984, %rsp # imm = 0x3D8 ; AVX-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX-NEXT: vmovaps 576(%rdi), %ymm5 -; AVX-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX-NEXT: vmovaps 512(%rdi), %ymm14 ; AVX-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX-NEXT: vmovaps 224(%rdi), %ymm13 ; AVX-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 ; AVX-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX-NEXT: vmovaps %ymm4, %ymm6 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rdi), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX-NEXT: 
vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 384(%rdi), %ymm10 -; AVX-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 @@ -3079,24 +3070,24 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm9[2,3],ymm13[4,5],ymm9[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm2[3,0],ymm3[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[1,2,3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 304(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vbroadcastss 304(%rdi), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3109,187 +3100,191 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps (%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[3,0],ymm0[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX-NEXT: vbroadcastss 144(%rdi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 144(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[2,0],ymm13[7,4],ymm8[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm7[4,5],ymm12[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[0,0],ymm5[5,4],ymm3[4,4] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm2[0,0],ymm4[5,4],ymm2[4,4] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm14[2,0],ymm12[7,4],ymm14[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0],ymm8[2,0],ymm15[7,4],ymm8[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 608(%rdi), %ymm7 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[0,0],ymm7[5,4],ymm2[4,4] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm8[0,0],ymm7[5,4],ymm8[4,4] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] -; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] +; AVX-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm15[0,0],ymm5[5,4],ymm15[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX-NEXT: vmovaps 448(%rdi), %ymm9 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm9[1,0],ymm12[0,0],ymm9[5,4],ymm12[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = 
ymm5[2,0],ymm3[1,0],ymm5[6,4],ymm3[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[0,0],ymm13[3,0],ymm3[4,4],ymm13[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm13[2,2],ymm14[6,4],ymm13[6,6] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,2],ymm14[6,4],ymm1[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,0],ymm8[1,0],ymm7[6,4],ymm8[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm11[3,0],ymm2[4,4],ymm11[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm11[3,0],ymm1[4,4],ymm11[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm11[2,2],ymm14[6,4],ymm11[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,0],ymm15[1,0],ymm5[6,4],ymm15[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: 
vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm10[3,0],ymm6[4,4],ymm10[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm10[2,2],ymm14[6,4],ymm10[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm9[2,0],ymm12[1,0],ymm9[6,4],ymm12[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] ; AVX-NEXT: vblendps $127, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm7[7] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm9[7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm5[7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 16-byte Folded Reload -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm9[7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm4, 64(%rsi) @@ -3315,7 +3310,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX-NEXT: vmovaps %ymm15, 64(%r8) +; AVX-NEXT: vmovaps %ymm14, 64(%r8) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm4, (%r8) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3326,134 +3321,135 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm2, (%r9) ; AVX-NEXT: vmovaps %ymm3, 96(%r9) ; AVX-NEXT: vmovaps %ymm0, 32(%r9) -; AVX-NEXT: addq $952, %rsp # imm = 0x3B8 +; AVX-NEXT: addq $984, %rsp # imm = 0x3D8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride5_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $968, %rsp 
# imm = 0x3C8 -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm10 ; AVX2-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,0,3] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4],ymm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-NEXT: vmovdqa 
%ymm7, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[1,6,3,0,5,2,7,u] ; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 464(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -3461,110 +3457,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqa %ymm10, %ymm6 +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm10 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm7 ; AVX2-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,2,7,4,u,u] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpermd %ymm12, %ymm10, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpermd %ymm15, %ymm10, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # 
ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm14[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpermd %ymm7, %ymm10, %ymm15 -; AVX2-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vpermd %ymm8, %ymm10, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vpermd %ymm15, %ymm10, %ymm5 +; AVX2-NEXT: vpermd %ymm4, %ymm10, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7] -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[0,1],ymm11[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0] -; AVX2-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1],ymm13[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [4,1,6,u] +; AVX2-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,7,2,7,2,7,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] -; AVX2-NEXT: vpermd %ymm12, %ymm5, %ymm4 +; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 
# 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[0,1],ymm13[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1],ymm12[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5],ymm5[6,7] +; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1],ymm15[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-NEXT: vpermd %ymm5, %ymm9, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm8, %ymm10, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm7[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm15, %ymm5, %ymm1 +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1],ymm6[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) @@ 
-3602,134 +3598,135 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-NEXT: vmovdqa %ymm3, 96(%r9) ; AVX2-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-NEXT: addq $968, %rsp # imm = 0x3C8 +; AVX2-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FP-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpermq 
{{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,6,3,0,5,2,7,u] ; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 464(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FP-NEXT: vpermd 
%ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -3737,110 +3734,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm6 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 +; 
AVX2-FP-NEXT: vpermd %ymm8, %ymm0, %ymm10 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,2,7,4,u,u] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpermd %ymm12, %ymm10, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm10, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; 
AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm14[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm7, %ymm10, %ymm15 -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vpermd %ymm8, %ymm10, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpermd %ymm15, %ymm10, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7] -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm11[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm13[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,1,6,u] +; AVX2-FP-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,7,2,7,2,7,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1],ymm13[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm12[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-FP-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm15[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = 
mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-FP-NEXT: vpermd %ymm5, %ymm9, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm8, %ymm10, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm7[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd %ymm15, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm6[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) @@ -3878,134 +3875,135 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-FP-NEXT: addq $968, %rsp # imm = 0x3C8 +; AVX2-FP-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] ; 
AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,6,3,0,5,2,7,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,6,3,0,5,2,7,u] ; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; 
AVX2-FCP-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 464(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,4,0] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -4013,110 +4011,110 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 
32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,6,2,7,4,0,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,2,7,4,u,u] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm4[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 
= ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm14[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm15 -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3],ymm0[4],ymm9[5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7] -; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm11[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,1,6,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm13[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,1,6,u] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,7,2,7,2,7,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vperm2i128 $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm13[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm12[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm15[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm7[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm6[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) @@ -4154,7 +4152,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-FCP-NEXT: addq $968, %rsp # imm = 0x3C8 +; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -4174,13 +4172,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4190,17 +4188,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4210,13 +4208,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4224,7 +4222,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt 
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4233,12 +4231,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4273,13 +4271,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4289,17 +4287,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4309,13 +4307,13 @@ define void 
@load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4323,7 +4321,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4332,12 +4330,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4372,13 +4370,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4388,17 +4386,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm11, 
%zmm14, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4408,13 +4406,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4422,7 +4420,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4431,12 +4429,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4471,13 +4469,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm12 = 
mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4487,17 +4485,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4507,13 +4505,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4521,7 +4519,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4530,12 +4528,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4570,13 +4568,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4586,17 +4584,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4606,13 +4604,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4620,7 +4618,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4629,12 +4627,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4669,13 +4667,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4685,17 +4683,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: 
vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4705,13 +4703,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4719,7 +4717,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4728,12 +4726,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4768,13 +4766,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4784,17 +4782,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4804,13 +4802,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4818,7 +4816,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4827,12 +4825,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -4867,13 +4865,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm6 @@ -4883,17 +4881,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -4903,13 +4901,13 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, 
%al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -4917,7 +4915,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 @@ -4926,12 +4924,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm16, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 @@ -5981,18 +5979,18 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i32_stride5_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX-NEXT: subq $2552, %rsp # imm = 0x9F8 ; AVX-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 576(%rdi), %ymm8 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 512(%rdi), %ymm6 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX-NEXT: vmovaps 224(%rdi), %ymm15 ; AVX-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX-NEXT: 
vmovaps 192(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6007,7 +6005,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4],ymm2[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6018,8 +6016,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -6031,12 +6029,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 896(%rdi), %ymm9 +; AVX-NEXT: vmovaps 896(%rdi), %ymm14 ; AVX-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 @@ -6049,14 +6047,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX-NEXT: vmovaps 1216(%rdi), %ymm15 +; AVX-NEXT: vmovaps 1184(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1216(%rdi), %ymm13 ; AVX-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -6069,12 +6068,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 @@ -6087,17 +6086,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: vmovaps 384(%rdi), %ymm14 +; AVX-NEXT: vmovaps 384(%rdi), %ymm12 ; AVX-NEXT: vmovaps 416(%rdi), %ymm7 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6106,14 +6105,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX-NEXT: vmovaps 704(%rdi), %ymm11 ; AVX-NEXT: vmovaps 736(%rdi), %ymm6 ; AVX-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] @@ -6124,42 +6123,41 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: vmovaps 1024(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1056(%rdi), %ymm10 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX-NEXT: vmovaps 1024(%rdi), %ymm10 +; AVX-NEXT: vmovaps 1056(%rdi), %ymm5 ; AVX-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 1088(%rdi), %ymm3, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] +; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm2[3,0],ymm3[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX-NEXT: vbroadcastss 304(%rdi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 304(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte 
Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6172,30 +6170,31 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm3[1,3],ymm0[6,5],ymm3[5,7] ; AVX-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm14[2,3],ymm3[4,5],ymm14[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[3,0],ymm0[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX-NEXT: vbroadcastss 944(%rdi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 944(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 1120(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -6205,26 +6204,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vbroadcastss 1264(%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups 
%ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX-NEXT: vmovaps (%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX-NEXT: vmovaps %ymm8, %ymm14 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[3,0],ymm0[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX-NEXT: vbroadcastss 144(%rdi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 144(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6239,30 +6239,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] -; AVX-NEXT: vmovaps %ymm12, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] ; AVX-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[3,0],ymm0[6,4],ymm2[7,4] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX-NEXT: vbroadcastss 784(%rdi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 784(%rdi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %ymm10, %ymm1 -; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -6272,10 +6269,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vbroadcastss 1104(%rdi), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -6283,20 +6280,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm15[2,0],ymm0[7,4],ymm15[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[2,1],ymm0[6,4],ymm15[6,5] ; 
AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -6309,10 +6306,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[2,0],ymm0[7,4],ymm3[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,1],ymm0[6,4],ymm3[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -6328,29 +6324,28 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[2,0],ymm0[7,4],ymm4[6,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,1],ymm0[6,4],ymm4[6,5] ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[0,0],ymm1[5,4],ymm6[4,4] +; AVX-NEXT: vmovaps 1248(%rdi), %ymm6 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm7[0,0],ymm6[5,4],ymm7[4,4] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -6365,10 +6360,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX-NEXT: vmovaps %ymm14, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,1],ymm1[6,4],ymm12[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX-NEXT: # ymm2 = mem[0,1,2,3],ymm0[4,5],mem[6,7] @@ -6376,16 +6371,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX-NEXT: vmovaps 448(%rdi), %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm0[0,0],ymm12[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm7[2,1],ymm2[6,4],ymm7[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[2,1],ymm2[6,4],ymm11[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] @@ -6393,26 +6389,26 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX-NEXT: vmovaps 768(%rdi), %ymm13 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX-NEXT: vmovaps 768(%rdi), %ymm11 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm11[2,3,0,1] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,0],ymm0[0,0],ymm11[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] -; AVX-NEXT: vmovaps 1088(%rdi), %ymm11 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm3[0,0],ymm11[5,4],ymm3[4,4] +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX-NEXT: # ymm9 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2],xmm9[3] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] +; AVX-NEXT: vshufpd {{.*#+}} xmm9 = xmm9[1,0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX-NEXT: vmovaps 1088(%rdi), %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm3[0,0],ymm14[5,4],ymm3[4,4] ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] @@ -6420,11 +6416,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm8[3,0],ymm5[4,4],ymm8[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload @@ -6450,10 +6445,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6462,7 +6457,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -6471,20 +6466,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm6[1,0],ymm1[6,4],ymm6[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm7[1,0],ymm6[6,4],ymm7[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm3[1,0],ymm11[6,4],ymm3[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm3[1,0],ymm14[6,4],ymm3[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6492,13 +6487,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm7, %ymm14 -; 
AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm12[3,0],ymm9[4,4],ymm12[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm10[1,0],ymm11[6,4],ymm10[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6506,13 +6501,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm8, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4] -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm7[3,0],ymm6[4,4],ymm7[7,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm3[1,0],ymm13[6,4],ymm3[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6549,8 +6544,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = ymm1[0,1,2,3,4,5,6],mem[7] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 16-byte Folded Reload ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] @@ -6560,23 +6555,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm12[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm13[7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; 
AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 16-byte Folded Reload +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 16-byte Folded Reload ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload @@ -6584,7 +6579,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm11[7] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload @@ -6610,18 +6605,18 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] +; AVX-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm14[7] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded 
Reload +; AVX-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload @@ -6698,61 +6693,61 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm1, 64(%r9) ; AVX-NEXT: vmovaps %ymm4, 32(%r9) ; AVX-NEXT: vmovaps %ymm0, (%r9) -; AVX-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX-NEXT: addq $2552, %rsp # imm = 0x9F8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride5_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX2-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-NEXT: subq $2088, %rsp # imm = 0x828 +; AVX2-NEXT: vmovdqa 864(%rdi), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 896(%rdi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4],ymm3[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; 
AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1152(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 1120(%rdi), %ymm1 @@ -6762,19 +6757,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm15 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,0,3] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6782,12 +6777,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 672(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -6798,70 +6793,71 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqa 960(%rdi), %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0] -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,6,3,0,5,2,7,u] ; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpblendd $204, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 944(%rdi), %ymm2 @@ -6869,60 +6865,59 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 1264(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 464(%rdi), %ymm2 +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 784(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastd 1104(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [2,7,4,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm13, %ymm0 ; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 @@ -6931,10 +6926,10 @@ define void @load_i32_stride5_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -6942,12 +6937,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm9[4,5],mem[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -6955,24 +6950,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-NEXT: vpermd %ymm1, 
%ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -6980,8 +6974,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm15[4,5],mem[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 ; AVX2-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -6993,21 +6987,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm5 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm15 +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm13, %ymm1 @@ -7016,83 +7009,81 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm14 -; AVX2-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,2,7,4,u,u] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] +; AVX2-NEXT: vpermd %ymm15, %ymm1, %ymm15 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX2-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpermd %ymm9, %ymm0, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpermd %ymm14, %ymm0, %ymm15 +; AVX2-NEXT: vpermd %ymm5, %ymm0, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = 
ymm11[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpermd %ymm10, %ymm0, %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpermd %ymm12, %ymm0, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] @@ -7107,28 +7098,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] ; AVX2-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vpermd %ymm15, %ymm7, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm3[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = 
mem[0,1],ymm6[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm9[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5],ymm2[6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm2 +; AVX2-NEXT: vpermd %ymm12, %ymm7, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -7141,18 +7133,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1],ymm12[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] +; AVX2-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1],ymm14[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7] ; AVX2-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[0,1],ymm10[0,1] @@ -7161,13 +7154,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1],ymm11[0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,1],ymm3[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] ; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload @@ -7232,8 +7225,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm7, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-NEXT: vmovdqa %ymm11, 64(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm7, 128(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -7254,61 +7246,61 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX2-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride5_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FP-NEXT: subq $2088, %rsp # imm = 0x828 +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4],ymm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %ymm1 @@ -7318,19 +7310,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: 
vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm15 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,0,3] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7338,12 +7330,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -7354,70 +7346,71 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3] -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,6,3,0,5,2,7,u] ; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 944(%rdi), %ymm2 @@ -7425,60 +7418,59 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 1264(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 464(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 784(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; 
AVX2-FP-NEXT: # ymm1 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastd 1104(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,7,4,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm13, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 @@ -7487,10 +7479,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -7498,12 +7490,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX2-FP-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm9[4,5],mem[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -7511,24 +7503,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-FP-NEXT: vpermd %ymm9, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -7536,8 +7527,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm15[4,5],mem[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded 
Reload @@ -7549,21 +7540,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm5 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm15 +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm10 +; AVX2-FP-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm1 @@ -7572,83 +7562,81 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm14 -; AVX2-FP-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,2,7,4,u,u] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] +; AVX2-FP-NEXT: vpermd %ymm15, %ymm1, %ymm15 ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm9, %ymm0, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm14, %ymm0, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm0, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm11[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpermd %ymm10, %ymm0, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpermd %ymm12, %ymm0, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] @@ -7663,28 +7651,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm15, %ymm7, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm3[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm6[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm5[0,1],ymm9[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd %ymm10, %ymm7, %ymm2 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm7, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -7697,18 +7686,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm12[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm14[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermd %ymm10, %ymm7, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm10[0,1] @@ -7717,13 +7707,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm11[0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 
+; AVX2-FP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm3[0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] ; AVX2-FP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload @@ -7788,8 +7778,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm11, 64(%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm7, 128(%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -7810,61 +7799,61 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FP-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX2-FP-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride5_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FCP-NEXT: subq $2088, %rsp # imm = 0x828 +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vinserti128 $1, 608(%rdi), %ymm3, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %ymm1 @@ -7874,19 +7863,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = 
ymm15[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7894,12 +7883,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -7910,70 +7899,71 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, 
(%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = 
ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,6,3,0,5,2,7,0] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,6,3,0,5,2,7,u] ; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 944(%rdi), %ymm2 @@ -7981,60 +7971,59 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 
32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 1264(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 464(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 784(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd 1104(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [2,7,4,0] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,7,4,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 @@ -8043,10 +8032,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -8054,12 +8043,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = 
mem[0,1,2,3],ymm9[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -8067,24 +8056,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -8092,8 +8080,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm15[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -8105,21 +8093,20 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: 
vpermd %ymm2, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm5 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1 @@ -8128,83 +8115,81 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,2,7,4,0,0] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = 
ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,2,7,4,u,u] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm15 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = mem[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = mem[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm11[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm14[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] @@ -8219,28 +8204,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm3[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm6[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm14[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[0,1],ymm9[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -8253,18 +8239,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm12[0,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm14[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm10[0,1] @@ -8273,13 +8260,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm3[4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm11[0,1] -; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm3[0,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload @@ -8344,8 +8331,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 64(%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm7, 128(%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -8366,7 +8352,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FCP-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX2-FCP-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -8389,7 +8375,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8404,7 +8390,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8414,7 +8400,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8446,12 +8432,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, 
%zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -8492,7 +8478,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -8508,7 +8494,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -8520,7 +8506,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -8529,7 +8515,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -8538,7 +8524,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -8593,7 +8579,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8608,7 +8594,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8618,7 +8604,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8650,12 +8636,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -8696,7 +8682,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -8712,7 +8698,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -8724,7 +8710,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 
%zmm18 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -8733,7 +8719,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -8742,7 +8728,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -8797,7 +8783,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -8812,7 +8798,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -8822,7 +8808,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8854,12 +8840,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = 
[3,8,13,18,23,28,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -8900,7 +8886,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -8916,7 +8902,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -8928,7 +8914,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -8937,7 +8923,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -8946,7 +8932,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9001,7 +8987,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9016,7 +9002,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9026,7 +9012,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9058,12 +9044,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9104,7 +9090,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9120,7 +9106,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9132,7 +9118,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9141,7 +9127,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9150,7 +9136,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9205,7 +9191,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9220,7 +9206,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9230,7 +9216,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9262,12 +9248,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9308,7 +9294,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9324,7 +9310,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9336,7 +9322,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9345,7 +9331,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9354,7 +9340,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9409,7 +9395,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9424,7 +9410,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9434,7 +9420,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9466,12 +9452,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9512,7 +9498,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9528,7 +9514,7 @@ define void 
@load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9540,7 +9526,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9549,7 +9535,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9558,7 +9544,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9613,7 +9599,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9628,7 +9614,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9638,7 +9624,7 @@ define void @load_i32_stride5_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9670,12 +9656,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9716,7 +9702,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9732,7 +9718,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9744,7 +9730,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9753,7 +9739,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9762,7 +9748,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} @@ -9817,7 +9803,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,5,10,15,20,25,30,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -9832,7 +9818,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [17,22,27,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -9842,7 +9828,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [2,7,12,17,22,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9874,12 +9860,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,8,13,18,23,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,9,14,19,24,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, 
%zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -9920,7 +9906,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 @@ -9936,7 +9922,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 @@ -9948,7 +9934,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 @@ -9957,7 +9943,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 @@ -9966,7 +9952,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 85ed61811af53..db145831d1fc5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -160,31 +160,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i32_stride6_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vextractps $2, %xmm1, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512-NEXT: vextractps $3, %xmm1, %r10d +; AVX512-NEXT: vextractps $2, %xmm1, %eax +; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm2 +; AVX512-NEXT: vextractps $3, %xmm1, %eax ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vmovd %xmm2, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512-NEXT: vpermps %ymm5, %ymm4, %ymm4 +; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vmovq %xmm3, (%rsi) +; AVX512-NEXT: vmovq %xmm2, (%rsi) ; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) +; AVX512-NEXT: vmovq %xmm3, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm2, (%r9) +; AVX512-NEXT: vmovlps %xmm4, (%r9) ; AVX512-NEXT: vmovlps %xmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -192,21 +192,21 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i32_stride6_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,1,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [5,3,0,0] ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) @@ -219,31 +219,31 @@ define void 
@load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i32_stride6_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-NEXT: vextractps $2, %xmm1, %eax +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm0, %xmm2 +; AVX512DQ-NEXT: vextractps $3, %xmm1, %eax ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-NEXT: vmovd %xmm4, %eax +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-NEXT: vpermps %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm2, (%r9) +; AVX512DQ-NEXT: vmovlps %xmm4, (%r9) ; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -251,21 +251,21 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,1,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [5,3,0,0] ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovlps %xmm2, 
(%rdx) @@ -278,31 +278,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i32_stride6_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512BW-NEXT: vextractps $2, %xmm1, %eax +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm2 +; AVX512BW-NEXT: vextractps $3, %xmm1, %eax ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vmovd %xmm2, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX512BW-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512BW-NEXT: vpermps %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm2, (%r9) +; AVX512BW-NEXT: vmovlps %xmm4, (%r9) ; AVX512BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -310,21 +310,21 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,1,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [5,3,0,0] ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX512BW-FCP-NEXT: vmovlps %xmm0, 
(%rsi) ; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) @@ -337,31 +337,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %eax +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm2 +; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %eax ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vmovd %xmm4, %eax +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] +; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9) ; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -369,21 +369,21 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [1,7,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,1,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; 
AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm5 = [5,3,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) @@ -510,13 +510,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] ; AVX2-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm5 @@ -531,13 +531,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [5,3,0,0] ; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) @@ -554,13 +554,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm5 @@ -575,13 +575,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3] -; AVX2-FP-NEXT: 
vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm5 = [5,3,0,0] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi) @@ -598,34 +598,34 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,4,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [1,7,5,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,0,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,0,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX2-FCP-NEXT: vpbroadcastd %xmm7, %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,1,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,1,7,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vpbroadcastd 84(%rdi), %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,2,0,0] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,3,0,0] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi) @@ -642,17 +642,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; 
AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -668,17 +668,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -694,17 +694,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -720,17 +720,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -746,17 +746,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -772,17 +772,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -798,17 +798,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 
= [1,7,13,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -824,17 +824,17 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,12,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,7,13,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,7,13,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,8,14,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,9,15,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,10,16,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,10,16,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,11,17,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,11,17,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1209,47 +1209,47 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i32_stride6_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,4,u] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,4,u] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm11 = [1,7,5,u] ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm11, %ymm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm9, %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [2,4,2,4,2,4,2,4] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm11 = [u,u,u,4,2,u,u,u] ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm11 = [2,0,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,0,6,4,0,0,6,4] -; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm13, %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,1,7,5,0,1,7,5] -; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm13, %ymm11 +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,0,6,4,0,0,6,4] +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm14 = [0,1,7,5,0,1,7,5] +; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm14, %ymm11 ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm13 = mem[3,3,3,3] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm13 = [3,1,7,5,0,u,u,u] @@ -1259,23 +1259,23 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6] ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; 
AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm11, (%r8) @@ -1290,35 +1290,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512-NEXT: vmovdqa %ymm8, (%rdx) @@ -1335,35 +1335,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx) @@ -1380,35 +1380,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 
= [0,6,12,18,24,30,u,u] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx) @@ -1425,35 +1425,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx) @@ -1470,35 +1470,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx) @@ -1515,35 +1515,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; 
AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) @@ -1560,35 +1560,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx) @@ -1605,35 +1605,35 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) @@ -1970,8 +1970,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm6 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm6[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm15, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,2],ymm4[6,4],ymm15[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] @@ -2137,175 +2137,176 @@ define void @load_i32_stride6_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride6_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $392, %rsp # imm = 0x188 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1] +; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,4,u] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm7 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[0,1],ymm15[0,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm0, %ymm7 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2] -; AVX2-NEXT: vpermps %ymm4, %ymm12, %ymm14 -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2],ymm10[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm11 +; AVX2-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] +; AVX2-NEXT: vmovaps %ymm6, %ymm7 +; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-NEXT: vmovaps %ymm13, %ymm0 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[0,1],ymm13[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX2-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u] ; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3] -; AVX2-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm2 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vblendps $48, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,0,2,0,4,4,6,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vmovaps %ymm7, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovaps 80(%rdi), %xmm4 -; 
AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpermps %ymm3, %ymm7, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm5, %ymm10, %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovaps 272(%rdi), %xmm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm11, %ymm7, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm2, %ymm15, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm11, %ymm4, %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-NEXT: vpermps %ymm6, %ymm15, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm12, %ymm10, %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 
= ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7] -; AVX2-NEXT: vpermps %ymm11, %ymm14, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm12, %ymm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm13, %ymm4, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermps %ymm6, %ymm7, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-NEXT: vmovaps %ymm8, 32(%r8) -; AVX2-NEXT: vmovaps %ymm0, (%r8) -; AVX2-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-NEXT: vmovaps %ymm1, (%r9) +; AVX2-NEXT: vmovaps %ymm10, (%r8) +; AVX2-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-NEXT: vmovaps %ymm0, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), 
%rax -; AVX2-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-NEXT: vmovaps %ymm2, (%rax) +; AVX2-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-NEXT: vmovaps %ymm1, (%rax) ; AVX2-NEXT: addq $392, %rsp # imm = 0x188 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2313,345 +2314,349 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride6_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $392, %rsp # imm = 0x188 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[0,1],ymm15[0,1] +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,4,u] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm7 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[0,1],ymm15[0,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %ymm0, %ymm7 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2] -; AVX2-FP-NEXT: vpermps %ymm4, %ymm12, %ymm14 -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 
352(%rdi), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm11 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovaps %ymm6, %ymm7 +; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovaps %ymm13, %ymm0 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[0,1],ymm13[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm0, %ymm2, %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3] -; AVX2-FP-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm2 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FP-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FP-NEXT: vmovups (%rsp), %ymm9 
# 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,2,0,4,4,6,4] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovaps %ymm7, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1,3,3,7,5,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,7,5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpermps %ymm3, %ymm7, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm10, %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm11, %ymm7, %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm2, %ymm15, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded 
Reload +; AVX2-FP-NEXT: # ymm11 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm11, %ymm4, %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FP-NEXT: vpermps %ymm6, %ymm15, %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm10, %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm11, %ymm14, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm13, %ymm4, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] +; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7] +; 
AVX2-FP-NEXT: vpermps %ymm6, %ymm7, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, 32(%r8) -; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-FP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, (%r8) +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FP-NEXT: addq $392, %rsp # imm = 0x188 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride6_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: subq $424, %rsp # imm = 0x1A8 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm7 -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm5 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4,2,4,2,4,2,4,2] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} 
ymm3 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[0,1],ymm10[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,2,2,2,4,6,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,4,u] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[0,1],ymm15[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm7 = [4,2,4,2,4,2,4,2] +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm11 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[0,1],ymm13[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,3,5,3,5,3,5,3] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm7 = [5,3,5,3,5,3,5,3] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,6,4,2,0,6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [2,0,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm13[4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,6,4,0,0,6,4] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm8, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm6[4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,0,6,4,0,0,6,4] +; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm10, %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4],ymm0[5],ymm3[6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,1,7,5,0,1,7,5] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm8 = [3,1,7,5,0,u,u,u] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm4 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm11 = [3,1,7,5,0,u,u,u] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm10, %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm7, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: 
vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm4, %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm12[0,1],mem[2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm12, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm14, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] +; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FCP-NEXT: vmovaps %ymm10, (%r8) +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FCP-NEXT: addq $360, %rsp # imm = 0x168 +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FCP-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -2667,7 +2672,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k2 @@ -2681,14 +2686,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = 
[2,8,14,20,26,u,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2700,7 +2705,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2713,7 +2718,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 @@ -2727,7 +2732,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2755,7 +2760,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 @@ -2769,14 +2774,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2788,7 +2793,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2801,7 +2806,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -2815,7 +2820,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2843,7 +2848,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 @@ -2857,14 +2862,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2876,7 +2881,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2889,7 +2894,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 @@ -2903,7 +2908,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2931,7 +2936,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 @@ -2945,14 +2950,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2964,7 +2969,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2977,7 +2982,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -2991,7 +2996,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3019,7 +3024,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -3033,14 +3038,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3052,7 +3057,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3065,7 +3070,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -3079,7 +3084,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3107,7 +3112,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 @@ -3121,14 +3126,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3140,7 +3145,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3153,7 +3158,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d 
%zmm6, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -3167,7 +3172,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3195,7 +3200,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 @@ -3209,14 +3214,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3228,7 +3233,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3241,7 +3246,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} 
ymm11 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -3255,7 +3260,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3283,7 +3288,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 @@ -3297,14 +3302,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -3316,7 +3321,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -3329,7 +3334,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 
{{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -3343,7 +3348,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [21,27,1,7,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -4003,8 +4008,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: subq $1048, %rsp # imm = 0x418 ; AVX-NEXT: vmovaps 416(%rdi), %ymm7 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%rdi), %ymm9 -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 384(%rdi), %ymm8 +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 160(%rdi), %ymm5 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 128(%rdi), %ymm6 @@ -4016,12 +4021,12 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm8, %xmm4 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm4[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3] +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vextractf128 $1, %ymm9, %xmm4 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1],xmm4[2,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -4032,22 +4037,22 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] ; AVX-NEXT: vextractf128 $1, %ymm11, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm3[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3] -; AVX-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX-NEXT: vmovaps 480(%rdi), %ymm8 +; AVX-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, 
480(%rdi), %ymm2, %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,2],ymm1[6,4],ymm5[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2] @@ -4074,7 +4079,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovapd 352(%rdi), %ymm2 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 320(%rdi), %ymm5 -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd %ymm5, (%rsp) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[0,1] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2] @@ -4106,28 +4111,29 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm4[3,0] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,2],xmm4[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,0],xmm4[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,2],xmm4[1,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4] +; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX-NEXT: # ymm9 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm14[2,3],ymm8[6,4],ymm14[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm14[2,3],ymm9[6,4],ymm14[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm5[1,3],ymm0[7,5],ymm5[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm5[1,3],ymm0[7,5],ymm5[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,0],xmm3[3,0] ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm9[1,0],ymm8[7,4],ymm9[5,4] +; 
AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm8[3,0],mem[1,0],ymm8[7,4],mem[5,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm9[1,3],ymm0[7,5],ymm9[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] @@ -4142,7 +4148,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # ymm3 = ymm3[2,0],mem[2,3],ymm3[6,4],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: vshufps $215, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] @@ -4192,13 +4198,13 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,1],ymm11[2,0],ymm10[6,5],ymm11[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,0],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7] ; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload ; AVX-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] @@ -4209,14 +4215,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm8[2,0],ymm9[6,5],ymm8[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] ; AVX-NEXT: vextractf128 $1, %ymm12, %xmm14 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] @@ -4247,15 +4253,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3] -; AVX-NEXT: vmovaps %ymm9, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm8[2,1],ymm9[7,5],ymm8[6,5] +; AVX-NEXT: vmovaps %ymm5, %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm10[3,1],ymm11[4,5],ymm10[7,5] -; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm5[3,1],mem[3,3] +; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm9[3,1],mem[3,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,1],ymm15[2,1],ymm11[7,5],ymm15[6,5] @@ -4263,11 +4269,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4300,7 +4306,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 224(%rdi), %xmm0 @@ -4318,7 +4324,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, 
(%rsp) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] @@ -4350,7 +4356,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm9[1,0],ymm1[7,4],ymm9[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,3],ymm1[2,0],ymm9[4,7],ymm1[6,4] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vblendps $12, (%rsp), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = xmm4[0,1],mem[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload @@ -4415,7 +4421,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm2, (%r8) ; AVX-NEXT: vmovaps %ymm11, 96(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm2, 32(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm2, (%r9) @@ -4432,53 +4438,53 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i32_stride6_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm9 +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{.*#+}} xmm10 = [0,6,4,u] +; AVX2-NEXT: vmovaps {{.*#+}} xmm15 = [0,6,4,u] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = 
ymm8[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] -; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] +; AVX2-NEXT: vpermps %ymm7, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[0,1],ymm10[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm10, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-NEXT: vpermps %ymm4, %ymm15, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-NEXT: vmovaps %ymm6, %ymm9 +; AVX2-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %ymm1 @@ -4486,129 +4492,133 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-NEXT: vpermps %ymm1, %ymm10, %ymm8 -; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm15, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vmovaps 352(%rdi), %ymm8 -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm14 -; AVX2-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpermps %ymm13, %ymm10, %ymm14 +; AVX2-NEXT: vmovaps %ymm10, %ymm0 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 608(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 608(%rdi), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] -; AVX2-NEXT: vpermps %ymm14, %ymm10, %ymm10 +; AVX2-NEXT: vmovaps 576(%rdi), %ymm10 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-NEXT: vpermps %ymm12, %ymm15, %ymm14 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm6 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 640(%rdi), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-NEXT: vmovaps 704(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[0,1],ymm6[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4,5,6,7] +; AVX2-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 736(%rdi), %ymm6 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = [1,7,5,u] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [5,3,5,3,5,3,5,3] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-NEXT: vshufps 
{{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3] -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm7, %ymm10, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm1, %ymm9, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm10, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm13, %ymm6, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm14, %ymm9, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm15, %ymm10, %ymm1 +; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4] ; 
AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,0,4,4,6,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; 
AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,0,2,0,4,4,6,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -4618,15 +4628,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-NEXT: vmovaps %ymm12, %ymm10 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -4635,133 +4645,134 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] +; AVX2-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-NEXT: vshufps 
{{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] -; AVX2-NEXT: vmovaps %ymm6, %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmovaps %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vmovaps %ymm1, %ymm4 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpermps %ymm3, %ymm8, %ymm1 +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm7, %ymm3, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 80(%rdi), %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm14, %ymm8, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermps %ymm12, %ymm5, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vblendps $240, (%rsp), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm12, %ymm8, %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm2, %ymm5, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vblendps $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2,3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm8, %ymm5, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovaps 656(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpermps %ymm10, %ymm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpermps %ymm14, %ymm13, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-NEXT: vpermps %ymm15, %ymm6, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpermps %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm10, %ymm11, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] ; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm2 -; 
AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-NEXT: vpermps %ymm12, %ymm13, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm10[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6,7] +; AVX2-NEXT: vpermps %ymm14, %ymm11, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-NEXT: vpermps %ymm15, %ymm13, %ymm4 +; AVX2-NEXT: vpermps %ymm15, %ymm11, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm4 +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) @@ -4795,71 +4806,71 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm4, 64(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, (%r8) -; AVX2-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-NEXT: vmovaps 
%ymm3, 64(%rax) ; AVX2-NEXT: vmovaps %ymm2, (%rax) -; AVX2-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride6_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm10 = [0,6,4,u] +; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm15 = [0,6,4,u] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm1 +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm2, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm9[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[0,1],ymm10[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm10, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = 
ymm4[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpermps %ymm4, %ymm15, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-FP-NEXT: vmovaps %ymm6, %ymm9 +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm1 @@ -4867,129 +4878,133 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vpermps %ymm1, %ymm10, %ymm8 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermps %ymm1, %ymm15, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm14 -; AVX2-FP-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm13, %ymm10, %ymm14 +; AVX2-FP-NEXT: vmovaps %ymm10, %ymm0 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 
576(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm10, %ymm10 +; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpermps %ymm12, %ymm15, %ymm14 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm6[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[0,1],ymm6[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm9 = [1,7,5,u] ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [5,3,5,3,5,3,5,3] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [5,3,5,3,5,3,5,3] -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm7, %ymm10, %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3,2,3,5,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm1, %ymm9, 
%ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,3,2,3,5,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm10, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm13, %ymm6, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm14, %ymm9, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FP-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm10, %ymm1 +; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] ; 
AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm7 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -4999,15 +5014,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-FP-NEXT: vmovaps %ymm12, %ymm10 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -5016,133 +5031,134 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = 
ymm11[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] -; AVX2-FP-NEXT: vmovaps %ymm6, %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovaps %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, %ymm4 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpermps %ymm3, %ymm8, %ymm1 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: 
vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm3, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm8, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm1 +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm12, %ymm5, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vblendps $240, (%rsp), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm8, %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm10, %ymm6, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm1, %ymm3, %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm5, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm5, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm10, %ymm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpermps %ymm14, %ymm13, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm6, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm5, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm10, %ymm11, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FP-NEXT: 
vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] ; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm13, %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm14, %ymm11, %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm13, %ymm4 +; AVX2-FP-NEXT: vpermps %ymm15, %ymm11, %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm10, %ymm6, %ymm4 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) @@ -5176,62 +5192,61 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: 
vmovaps %ymm4, (%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FP-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-FP-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride6_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $1192, %rsp # imm = 0x4A8 +; AVX2-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm11 = [0,6,4,u] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm14 = [4,2,4,2,4,2,4,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[0,1],ymm6[0,1] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[0,1],ymm6[0,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermps 
%ymm4, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1 @@ -5239,7 +5254,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm1 @@ -5253,46 +5268,46 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,2,2,4,6,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm14, %ymm15 +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermps %ymm14, %ymm9, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm11, %ymm15 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 
= ymm9[0,1],ymm5[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm5[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm14, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm10 = [1,7,5,u] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm10, %ymm11 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5305,48 +5320,47 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm14, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] ; 
AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm8 = [2,0,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm8, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,0,6,4,0,0,6,4] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vblendps $207, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,0,6,4,0,0,6,4] +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm14, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm8, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -5356,66 +5370,66 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm8, 
%ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,7,5,0,1,7,5] ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm10 = [3,1,7,5,0,u,u,u] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [3,1,7,5,0,u,u,u] +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1,2,3,4],ymm2[5],ymm13[6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3,4],ymm2[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1,2,3,4],ymm2[5],ymm10[6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5424,64 +5438,64 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $12, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm5, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; 
AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] ; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm13, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,3,1,7,0,3,1,7] @@ -5492,21 +5506,21 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm13, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: 
vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm13, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm13, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi) @@ -5552,7 +5566,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FCP-NEXT: addq $1192, %rsp # imm = 0x4A8 +; AVX2-FCP-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -5567,76 +5581,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, 
%zmm15 -; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512-NEXT: movw $31, %di ; AVX512-NEXT: kmovw %edi, %k2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -5648,14 +5662,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -5673,15 +5687,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -5712,76 +5726,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 +; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512-FCP-NEXT: movw $31, %di ; 
AVX512-FCP-NEXT: kmovw %edi, %k2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -5793,14 +5807,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -5818,15 +5832,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -5857,76 +5871,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512DQ-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; 
AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-NEXT: movw $31, %di ; AVX512DQ-NEXT: kmovw %edi, %k2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -5938,14 +5952,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512DQ-NEXT: 
vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -5963,15 +5977,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -6002,76 +6016,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512DQ-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-FCP-NEXT: movw $31, %di ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = 
[0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -6083,14 +6097,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -6108,15 +6122,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, 
%zmm20, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -6147,76 +6161,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; 
AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512BW-NEXT: movw $31, %di ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -6228,14 +6242,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512BW-NEXT: 
vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -6253,15 +6267,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -6292,76 +6306,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm15, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512BW-FCP-NEXT: movw $31, %di ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; 
AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -6373,14 +6387,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -6398,15 +6412,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, 
%zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -6437,76 +6451,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: movw $31, %di ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -6518,14 +6532,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -6543,15 +6557,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -6582,76 +6596,76 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,6,12,18,24,30,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,7,13,19,25,31,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm5, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,8,14,20,26,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movw $31, %di ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [3,9,15,21,27,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] @@ -6663,14 +6677,14 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [20,26,0,6,12,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d 
%zmm11, %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} @@ -6688,15 +6702,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm12, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} @@ -8052,9 +8066,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm13 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm13[0,3] ; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm13[0,3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -8154,14 +8168,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3] -; AVX-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm1 +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6] +; AVX-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[0,0],ymm3[6,4],ymm1[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovapd 160(%rdi), %ymm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8582,9 +8596,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5] ; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm8[3,1],mem[3,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,1],ymm3[2,1],ymm10[7,5],ymm3[6,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -8608,8 +8622,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm1[3,1],mem[3,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm6[2,1],ymm5[7,5],ymm6[6,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -8625,7 +8639,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8697,8 +8711,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 800(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -8707,11 +8721,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = 
ymm1[1],ymm6[0],ymm1[2],ymm6[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm3[0,0],ymm10[6,4],ymm3[4,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm3[0,0],ymm14[6,4],ymm3[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8762,77 +8776,77 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX-NEXT: vmovaps 1360(%rdi), %xmm13 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] -; AVX-NEXT: vmovapd 1424(%rdi), %xmm13 -; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = ymm13[1],mem[0],ymm13[2],mem[3] +; AVX-NEXT: vmovaps 1360(%rdi), %xmm10 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3] +; AVX-NEXT: vmovapd 1424(%rdi), %xmm10 +; AVX-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm10[1],mem[0],ymm10[2],mem[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4] ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm11[1,0],ymm13[7,4],ymm11[5,4] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm11[1,0],ymm10[7,4],ymm11[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,3],ymm13[2,0],ymm11[4,7],ymm13[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = 
ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm7[1,0],ymm13[7,4],ymm7[5,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm5[1,0],ymm13[7,4],ymm5[5,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] +; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1],mem[2,3] -; AVX-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX-NEXT: # xmm12 = xmm10[0,1],mem[2,3] +; AVX-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX-NEXT: # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4] ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm3[1,0],ymm10[7,4],ymm3[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm3[1,0],ymm14[7,4],ymm3[5,4] ; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm10[2,0],ymm3[4,7],ymm10[6,4] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload @@ -8998,16 +9012,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermps %ymm14, %ymm9, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermps %ymm15, %ymm9, %ymm0 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; 
AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] -; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2 +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9123,6 +9137,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm2 +; AVX2-NEXT: vmovaps %ymm6, %ymm14 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 864(%rdi), %ymm1 @@ -9130,14 +9146,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: vpermps %ymm4, %ymm9, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9155,137 +9171,78 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermps %ymm2, %ymm9, %ymm0 ; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1216(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-NEXT: vmovaps 1216(%rdi), %ymm6 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm1[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] ; AVX2-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1312(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermps %ymm6, %ymm14, %ymm1 ; AVX2-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] -; AVX2-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm1 +; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm7, %ymm1, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm13, %ymm0, %ymm13 ; AVX2-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-NEXT: vpermps %ymm12, %ymm1, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-NEXT: vshufps 
{{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] ; AVX2-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm2 +; AVX2-NEXT: vpermps %ymm6, %ymm1, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9303,107 +9260,167 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps 
$48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; 
AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # 
ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,2,0,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-NEXT: vmovaps %ymm14, %ymm13 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = 
ymm3[0,2,0,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX2-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmovaps %ymm9, %ymm14 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] @@ -9416,7 +9433,6 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] @@ -9436,26 +9452,26 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vmovaps %ymm13, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1,3,3,7,5,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-NEXT: vpermilps 
$247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7] @@ -9466,8 +9482,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm14 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] @@ -9480,7 +9496,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload @@ -9490,112 +9506,111 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm3, %ymm7, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm13, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 656(%rdi), %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm12, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 848(%rdi), %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovaps 1040(%rdi), %xmm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm14, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm12, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm14, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9604,92 +9619,93 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 1232(%rdi), %xmm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm10, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = 
mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm8, %ymm2, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovaps 1424(%rdi), %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] 
-; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] -; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermps (%rsp), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7] ; AVX2-NEXT: vpermps %ymm14, %ymm1, %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3,4,5,6,7] ; AVX2-NEXT: vpermps %ymm12, %ymm0, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4],ymm12[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] ; AVX2-NEXT: vpermps %ymm10, %ymm1, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] ; AVX2-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] ; AVX2-NEXT: vpermps %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) @@ -9773,9 +9789,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm1, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-NEXT: 
vmovaps %ymm12, 160(%rax) -; AVX2-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9809,16 +9825,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm9 = [0,6,4,u] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm9, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm9, %ymm0 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] -; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9934,6 +9950,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm2 +; AVX2-FP-NEXT: vmovaps %ymm6, %ymm14 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1 @@ -9941,14 +9959,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpermps %ymm4, %ymm9, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9966,137 +9984,78 @@ define void 
@load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermps %ymm2, %ymm9, %ymm0 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm1[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm6, %ymm14, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm7, %ymm1, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm13, %ymm0, %ymm13 ; AVX2-FP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm12, %ymm1, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm2 +; AVX2-FP-NEXT: vpermps %ymm6, %ymm1, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps 
$12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; 
AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10114,107 +10073,167 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,0,4,4,6,4] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 
+; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,2,0,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,2,0,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0,2,0,4,4,6,4] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] 
-; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[3,1,3,3,7,5,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,0,4,4,6,4] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-FP-NEXT: vmovaps %ymm14, %ymm13 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FP-NEXT: vpermilps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovaps %ymm9, %ymm14 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,2,0,3] @@ -10227,7 +10246,6 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] @@ -10247,26 +10265,26 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,7,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovaps %ymm13, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1,3,3,7,5,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1,3,3,7,5,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,1,4,5,7,5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] ; AVX2-FP-NEXT: vpermilps $247, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[3,1,3,3,7,5,7,7] @@ -10277,8 +10295,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups (%rsp), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] @@ -10291,7 +10309,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload @@ -10301,112 +10319,111 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm3, %ymm7, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] +; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm13, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 848(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, 
%ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1040(%rdi), %xmm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm3, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm14, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10415,92 +10432,93 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 1232(%rdi), %xmm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm10, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm3, %ymm2 -; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm2, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1424(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm7, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermps (%rsp), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = 
ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm14, %ymm1, %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm10, %ymm1, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vpermps %ymm5, %ymm1, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) @@ -10584,9 +10602,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-FP-NEXT: vmovaps %ymm12, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10602,40 +10620,39 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride6_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $2536, %rsp # imm = 0x9E8 -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[0,1],ymm2[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[0,1],ymm2[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4,2,4,2,4,2,4,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; 
AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm5[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm4[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10650,7 +10667,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -10675,7 +10692,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -10700,7 +10717,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 @@ -10723,7 +10740,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm1 @@ -10745,8 +10762,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps 
%ymm8, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1 @@ -10759,16 +10775,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -10790,15 +10806,15 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,3,5,3,5,3,5,3] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10821,26 +10837,26 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm13 ; AVX2-FCP-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,3,2,3,5,7,6,7] @@ -10855,35 +10871,35 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [2,0,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm15 = [0,0,6,4,0,0,6,4] ; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10891,20 +10907,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm15, %ymm2 ; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload @@ -10921,54 +10937,54 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $243, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = 
ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm9, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm15, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload @@ -10993,46 +11009,47 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] ; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] ; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1,2,3,4],ymm7[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm12, %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm7 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4],ymm7[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm6 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1,2,3,4],ymm6[5],ymm13[6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm4 +; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm9 = mem[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm13[1,2,3,4],ymm4[5],ymm13[6,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3,4],ymm0[5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm8[1,2,3,4],ymm0[5],ymm8[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1,2,3,4],ymm0[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 @@ -11041,34 +11058,33 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpermilps {{.*#+}} xmm2 = mem[3,3,3,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FCP-NEXT: vblendps $222, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1,2,3,4],ymm2[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $15, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,2,0,6,0,2,0,6] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11076,40 +11092,40 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 
; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11117,40 +11133,40 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 848(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; 
AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload @@ -11161,109 +11177,110 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 1040(%rdi), %xmm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm14, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm12, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 1232(%rdi), %xmm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 
= ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 1424(%rdi), %xmm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm3[2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1424(%rdi), %xmm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps $8, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; 
AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = 
ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -11350,7 +11367,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm8, 192(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm12, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11458,27 +11475,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -11493,60 +11510,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm22, 
%zmm28 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -11559,7 +11576,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -11579,8 +11596,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ 
-11632,7 +11649,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -11648,9 +11665,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11660,9 +11677,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11681,7 +11698,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -11693,16 +11710,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -11808,27 +11825,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -11843,60 +11860,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -11909,7 +11926,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -11929,8 +11946,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 ; 
AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11982,7 +11999,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -11998,9 +12015,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12010,9 +12027,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12031,7 +12048,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -12043,16 +12060,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm24, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -12158,27 +12175,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -12193,60 +12210,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm5, 
%zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -12259,7 +12276,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm17 
; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -12279,8 +12296,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12332,7 +12349,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -12348,9 +12365,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12360,9 +12377,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12381,7 +12398,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -12393,16 +12410,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512DQ-NEXT: 
vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -12508,27 +12525,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -12543,60 +12560,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d 
%zmm0, %zmm5, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -12609,7 +12626,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -12629,8 +12646,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12682,7 +12699,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -12698,9 +12715,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12710,9 +12727,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12731,7 +12748,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -12743,16 +12760,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -12858,27 +12875,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -12893,60 +12910,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = 
[2,8,14,20,26,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -12959,7 +12976,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -12979,8 +12996,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13032,7 +13049,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -13048,9 +13065,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13060,9 +13077,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13081,7 +13098,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -13093,16 +13110,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -13208,27 +13225,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -13243,60 +13260,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, 
%zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -13309,7 +13326,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -13329,8 +13346,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13382,7 +13399,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -13398,9 +13415,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13410,9 +13427,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13431,7 +13448,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -13443,16 +13460,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -13558,27 +13575,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -13593,60 +13610,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [20,26,0,6,12,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -13659,7 +13676,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -13679,8 +13696,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13732,7 +13749,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -13748,9 +13765,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13760,9 +13777,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13781,7 +13798,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -13793,16 +13810,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax) @@ -13908,27 +13925,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm24, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26 @@ -13943,60 +13960,60 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,8,14,20,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,9,15,21,27,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = 
[20,26,0,6,12,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [21,27,1,7,13,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [21,27,1,7,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 @@ -14009,7 +14026,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 @@ -14029,8 +14046,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm25, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14082,7 +14099,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} @@ -14098,9 +14115,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14110,9 +14127,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14131,7 +14148,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rsi) @@ -14143,16 +14160,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 7948141f6becd..bf06978b1c9e3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -95,8 +95,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6 +; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] @@ -130,8 +130,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-FP-NEXT: 
vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] @@ -165,8 +165,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] @@ -192,19 +192,19 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i32_stride7_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512-NEXT: vmovd %xmm1, %r11d +; AVX512-NEXT: vmovd %xmm1, %r10d ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm3 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512-NEXT: vmovaps (%rdi), %ymm5 ; AVX512-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -229,26 +229,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,2,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4 +; AVX512-FCP-NEXT: vpermps %zmm0, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 ; AVX512-FCP-NEXT: vpermps %zmm0, 
%zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10) ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -256,19 +256,19 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i32_stride7_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-NEXT: vmovd %xmm1, %r11d +; AVX512DQ-NEXT: vmovd %xmm1, %r10d ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm3 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -293,26 +293,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm1, %xmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4 +; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) ; 
AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10) ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -320,19 +320,19 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i32_stride7_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512BW-NEXT: vmovd %xmm1, %r11d +; AVX512BW-NEXT: vmovd %xmm1, %r10d ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 +; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm3 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512BW-NEXT: vmovaps (%rdi), %ymm5 ; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -357,26 +357,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm2 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm1, %xmm3 ; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,2,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm4, %xmm6 +; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4 +; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10) ; AVX512BW-FCP-NEXT: vmovlps 
%xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -384,19 +384,19 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-LABEL: load_i32_stride7_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2 -; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d +; AVX512DQ-BW-NEXT: vmovd %xmm1, %r10d ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -421,26 +421,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm1, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [7,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm4, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,11,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; 
AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -583,9 +583,9 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vbroadcastss 84(%rdi), %xmm4 ; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastss 84(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] ; AVX2-NEXT: vmovaps 80(%rdi), %xmm4 ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -639,9 +639,9 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FP-NEXT: vbroadcastss 84(%rdi), %xmm4 ; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vbroadcastss 84(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm4 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -695,11 +695,10 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FCP-NEXT: vbroadcastss 84(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vbroadcastss 84(%rdi), %xmm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,7,u,u,u,u,u] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm5 @@ -750,19 +749,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = 
[6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -780,19 +779,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -810,19 +809,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -840,19 +839,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -870,19 +869,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -900,19 +899,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa 
{{.*#+}} xmm7 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -930,19 +929,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -960,19 +959,19 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,14,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,8,15,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,9,16,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,10,17,24] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1256,7 +1255,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 96(%rdi), 
%ymm10 ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1275,7 +1274,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [1,0,7,6,5,u,u,u] ; AVX2-NEXT: vpermd %ymm7, %ymm11, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7 @@ -1301,7 +1300,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1357,7 +1356,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1376,7 +1375,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,0,7,6,5,u,u,u] ; AVX2-FP-NEXT: vpermd %ymm7, %ymm11, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm7 @@ -1402,7 +1401,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1458,7 +1457,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,6,0] +; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1477,7 +1476,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,7,6,5,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,0,7,6,5,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7 @@ -1503,7 +1502,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,3,0,0] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1514,7 +1513,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,1,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,0,7,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] @@ -1555,39 +1554,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = 
[u,u,u,u,u,6,13,20] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm4, (%rsi) @@ -1608,39 +1607,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512-FCP-NEXT: 
vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1661,39 +1660,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = 
[0,0,0,0,1,8,15,22] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) @@ -1714,39 +1713,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = 
[5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1767,39 +1766,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1820,39 +1819,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 
%ymm4, (%rsi) @@ -1873,39 +1872,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1926,39 +1925,39 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,3,10,17] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,3,10,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,4,11,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,4,11,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,5,12,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,5,12,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,6,13,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,6,13,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,7,14,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,8,15,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,2,9,16,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,2,9,16,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -2328,17 +2327,17 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: 
vmovaps (%rdi), %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps (%rdi), %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX-NEXT: vmovaps 320(%rdi), %ymm6 ; AVX-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps 224(%rdi), %xmm13 @@ -2358,7 +2357,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps (%rdi), %xmm9 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] @@ -2369,50 +2368,50 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] +; AVX-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm6[2,2],ymm5[5,5],ymm6[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0],xmm13[1],xmm11[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm3[2,2],ymm6[5,5],ymm3[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX-NEXT: vmovaps 384(%rdi), %ymm8 ; AVX-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm7[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm8[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm2[3,3],ymm1[4,4],ymm2[7,7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} 
xmm2 = zero,xmm2[1,2],xmm12[2] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm15[1,1],ymm5[2,2],ymm15[5,5],ymm5[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2] +; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm7[2] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm3[0,3],ymm14[7,5],ymm3[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm6[2,1],ymm14[2,0],ymm6[6,5],ymm14[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 @@ -2421,78 +2420,78 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7] ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps 
{{.*#+}} ymm2 = ymm6[1,0],ymm5[0,0],ymm6[5,4],ymm5[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,1],ymm7[1,3],ymm2[4,5],ymm7[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm6[0,0],ymm3[5,4],ymm6[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm2[0,2],ymm6[7,5],ymm2[4,6] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm13[3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm8[1,3],ymm3[4,5],ymm8[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm15[0,0],ymm5[5,4],ymm15[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,1],ymm2[0,2],ymm15[7,5],ymm2[4,6] ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm9[3] ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm5[2,0],ymm0[4,6],ymm5[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm8[2,0],ymm3[5,4],ymm8[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] ; AVX-NEXT: vmovaps 320(%rdi), %xmm5 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX-NEXT: vmovaps 288(%rdi), %xmm8 -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX-NEXT: vmovaps 288(%rdi), %xmm7 +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] ; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[0,0],ymm0[7,4],ymm2[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] ; AVX-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,1],ymm7[3,3],ymm2[6,5],ymm7[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,1],ymm8[3,3],ymm3[6,5],ymm8[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm8[3] +; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm2[2,0],ymm8[5,4],ymm2[6,4] +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,1],ymm4[3,3],ymm6[6,5],ymm4[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm11[3] @@ -2508,13 +2507,13 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] +; AVX-NEXT: 
vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm7[0,0],ymm3[7,4],ymm7[4,4] ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,0],ymm7[4,5],ymm2[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,0],ymm7[4,5],ymm3[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] @@ -2524,7 +2523,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[0,0],ymm6[7,4],ymm7[4,4] ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -2553,7 +2552,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm15, 32(%rax) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm5, (%rax) -; AVX-NEXT: vmovaps %ymm2, 32(%rax) +; AVX-NEXT: vmovaps %ymm3, 32(%rax) ; AVX-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -2561,36 +2560,36 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride7_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm9[6],ymm5[7] ; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm11 ; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] ; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm8 @@ -2603,177 +2602,177 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] ; AVX2-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm8[7] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] ; AVX2-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm12 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm14 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = 
ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] ; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1,2],mem[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] -; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm6[1,3],ymm7[4,6],ymm6[5,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [4,3,0,0] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-NEXT: vpermd %ymm0, %ymm10, %ymm11 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm11 ; AVX2-NEXT: vpbroadcastd 212(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpbroadcastd 324(%rdi), %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpermd %ymm4, %ymm10, %ymm4 +; AVX2-NEXT: vpbroadcastd 324(%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm13[0,1,2],xmm5[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vpbroadcastd 436(%rdi), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-NEXT: vpbroadcastd 216(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpbroadcastd 216(%rdi), %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-NEXT: vpbroadcastd 440(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vpbroadcastd 80(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 360(%rdi), %xmm4 -; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-NEXT: vpermd 
416(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm13[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 136(%rdi), %xmm6 +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd 192(%rdi), %ymm0, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpbroadcastd 80(%rdi), %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpbroadcastd 304(%rdi), %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] 
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r8) ; AVX2-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-NEXT: vmovdqa %ymm12, (%r9) +; AVX2-NEXT: vmovdqa %ymm15, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2781,36 +2780,36 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride7_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm9[6],ymm5[7] ; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm11 ; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 
= ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm8 @@ -2823,177 +2822,177 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr 
{{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps 
{{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] ; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1,2],mem[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] -; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm6[1,3],ymm7[4,6],ymm6[5,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [4,3,0,0] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm11 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm11 ; AVX2-FP-NEXT: vpbroadcastd 212(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpbroadcastd 324(%rdi), %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm4 +; AVX2-FP-NEXT: vpbroadcastd 324(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpbroadcastd 436(%rdi), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FP-NEXT: vpbroadcastd 216(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm13[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpbroadcastd 216(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FP-NEXT: vpbroadcastd 440(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpermd 192(%rdi), %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpbroadcastd 80(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FP-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 360(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FP-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,0,2,3,5,4,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm13[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 136(%rdi), %xmm6 +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpermd 192(%rdi), %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpbroadcastd 80(%rdi), %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; 
AVX2-FP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FP-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpbroadcastd 304(%rdi), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) ; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm12, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm15, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -3012,7 +3011,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,7,6,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm6[6],ymm10[7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] @@ -3050,7 +3049,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,7,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3070,33 +3069,33 @@ define void 
@load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm1[1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm12 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm3[1,3],ymm2[4,6],ymm3[5,7] ; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] @@ -3115,7 +3114,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,3,0,0] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = [4,3,0,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 @@ -3140,7 +3139,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,3,3,1,0,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,3,3,1,0,7,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpbroadcastd 216(%rdi), %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] @@ -3231,16 +3230,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k1 @@ -3250,9 +3249,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3261,24 +3260,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, 
%zmm0, %zmm14 ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-NEXT: kmovw %edi, %k2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3288,23 +3287,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3312,7 +3311,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vinserti32x4 $0, %xmm6, %zmm12, 
%zmm6 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3320,7 +3319,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3348,16 +3347,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -3367,9 +3366,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3378,24 +3377,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512-FCP-NEXT: kmovw %edi, %k2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 
= [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3405,23 +3404,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3429,7 +3428,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3437,7 +3436,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm1 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3465,16 +3464,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 @@ -3484,9 +3483,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3495,24 +3494,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-NEXT: kmovw %edi, %k2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512DQ-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3522,23 +3521,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3546,7 +3545,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3554,7 +3553,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3582,16 +3581,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,7,14,21,28,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -3601,9 +3600,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3612,24 +3611,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512DQ-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3639,23 +3638,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3663,7 +3662,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3671,7 +3670,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3699,16 +3698,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -3718,9 +3717,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3729,24 +3728,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3756,23 +3755,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: 
vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3780,7 +3779,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3788,7 +3787,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3816,16 +3815,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -3835,9 +3834,9 @@ define void 
@load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3846,24 +3845,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3873,23 +3872,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = 
[0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -3897,7 +3896,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -3905,7 +3904,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -3933,16 +3932,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -3952,9 +3951,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; 
AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -3963,24 +3962,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -3990,23 +3989,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -4014,7 +4013,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -4022,7 +4021,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -4050,16 +4049,16 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -4069,9 +4068,9 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -4080,24 +4079,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -4107,23 +4106,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} @@ -4131,7 +4130,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} @@ -4139,7 +4138,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} @@ -4881,7 +4880,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i32_stride7_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX-NEXT: subq $1464, %rsp # imm = 0x5B8 ; AVX-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 448(%rdi), %ymm3 @@ -4891,11 +4890,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmovaps (%rdi), %xmm8 @@ -4903,11 +4901,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 160(%rdi), %xmm2 -; 
AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX-NEXT: vmovaps 192(%rdi), %xmm7 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1] ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4919,8 +4917,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 448(%rdi), %xmm9 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 608(%rdi), %xmm2 @@ -4928,9 +4927,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vmovaps 640(%rdi), %xmm9 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 640(%rdi), %xmm10 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4940,24 +4938,24 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX-NEXT: vmovaps 320(%rdi), %ymm3 ; AVX-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX-NEXT: vmovaps 416(%rdi), %xmm3 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = 
zero,xmm1[1,2],xmm3[1] -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4971,392 +4969,391 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 768(%rdi), %ymm14 +; AVX-NEXT: vmovaps 768(%rdi), %ymm15 ; AVX-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vmovaps 864(%rdi), %xmm6 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1] +; AVX-NEXT: vmovaps 864(%rdi), %xmm13 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm0[2,2],ymm12[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm14[2,2],ymm6[5,5],ymm14[6,6] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2] ; AVX-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] -; AVX-NEXT: vmovaps %ymm5, %ymm7 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1],ymm1[2,2],ymm5[5,5],ymm1[6,6] +; AVX-NEXT: vmovaps %ymm5, %ymm9 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0],ymm2[3,3],ymm4[4,4],ymm2[7,7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm10[2] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm0[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,0],ymm5[3,3],ymm12[4,4],ymm5[7,7] -; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0],xmm11[1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7] -; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX-NEXT: 
vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm3[2] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 736(%rdi), %ymm5 -; AVX-NEXT: vmovaps %ymm14, %ymm3 -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm5[2,2],ymm14[5,5],ymm5[6,6] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX-NEXT: vmovaps 704(%rdi), %xmm4 -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3,4,5,6,7] -; AVX-NEXT: vmovaps 832(%rdi), %ymm13 -; AVX-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm13[0,1] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2] -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX-NEXT: # xmm11 = mem[2,3,2,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm9[0,3],ymm14[7,5],ymm9[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX-NEXT: # ymm14 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm10[0,3],ymm14[7,5],ymm10[4,7] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = 
ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX-NEXT: # ymm14 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm14[2,0],ymm3[6,5],ymm14[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] -; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm0[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,0],ymm4[3,3],ymm2[4,4],ymm4[7,7] +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm11[2] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 704(%rdi), %xmm7 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm11[1],xmm7[2,3] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX-NEXT: vmovaps %ymm15, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,1],ymm4[2,2],ymm15[5,5],ymm4[6,6] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,3,2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm12[3,4,5,6,7] +; AVX-NEXT: vmovaps 832(%rdi), %ymm12 +; AVX-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm12[0,1] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm15[3,3],ymm3[4,4],ymm15[7,7] +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vmovaps %xmm13, %xmm5 +; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm15[1,2],xmm13[2] +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,2,3] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm11 = 
ymm11[3,1],ymm3[0,3],ymm11[7,5],ymm3[4,7] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm14[0,3],ymm15[7,5],ymm14[4,7] +; AVX-NEXT: vmovaps %ymm6, %ymm13 +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,1],ymm15[2,0],ymm6[6,5],ymm15[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,1],ymm11[2,0],ymm2[6,5],ymm11[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] -; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX-NEXT: # xmm11 = xmm11[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1],ymm4[0,2],ymm8[7,5],ymm4[4,6] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = xmm0[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6] -; AVX-NEXT: vmovaps %xmm15, %xmm10 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm15[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm14[0,3],ymm15[7,5],ymm14[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm15 = 
ymm9[2,1],ymm15[2,0],ymm9[6,5],ymm15[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = ymm15[0],mem[0],ymm15[2],mem[2] +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm4[1,3],ymm0[4,5],ymm4[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[2,0],ymm12[4,6],ymm11[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm4[0,3],ymm15[7,5],ymm4[4,7] ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[0,0],ymm3[5,4],ymm2[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1],ymm7[0,2],ymm2[7,5],ymm7[4,6] -; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm6[3] -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 416(%rdi), %ymm15 -; AVX-NEXT: vmovaps %ymm5, %ymm8 -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2],ymm11[2,0],ymm13[4,6],ymm11[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[0,1,2],xmm0[3] -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,1],ymm15[2,0],ymm8[6,5],ymm15[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3,4,5,6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] +; AVX-NEXT: vmovaps %ymm3, %ymm7 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm15[0,1,2],xmm5[3] +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,1],ymm4[0,3],ymm10[7,5],ymm4[4,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm5[2,1],ymm10[2,0],ymm5[6,5],ymm10[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,1],ymm9[1,3],ymm0[4,5],ymm9[5,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm13[0,0],ymm0[5,4],ymm13[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,1],ymm0[0,2],ymm13[7,5],ymm0[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX-NEXT: # xmm10 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,1],ymm2[1,3],ymm1[4,5],ymm2[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,2],ymm13[2,0],ymm6[4,6],ymm13[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm9[0,0],ymm14[5,4],ymm9[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,1],ymm0[0,2],ymm9[7,5],ymm0[4,6] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 640(%rdi), %ymm10 +; AVX-NEXT: vmovups (%rsp), %ymm14 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,1],ymm14[1,3],ymm10[4,5],ymm14[5,7] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,2],ymm8[2,0],ymm6[4,6],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm5[0,0],ymm4[5,4],ymm5[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6] +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm12[3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX-NEXT: vmovaps %ymm15, %ymm9 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = 
ymm12[0,1],ymm15[1,3],ymm12[4,5],ymm15[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm5[2,0],ymm3[4,6],ymm5[6,4] +; AVX-NEXT: vmovaps %ymm3, %ymm13 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm6[0,0],ymm12[7,4],ymm6[4,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2],xmm2[3] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 864(%rdi), %ymm15 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,0],ymm0[6,4],ymm6[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7] +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm0[0,0],ymm6[7,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] -; AVX-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX-NEXT: vmovaps 512(%rdi), %xmm11 +; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4] -; AVX-NEXT: vmovaps %ymm14, %ymm6 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} 
ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1] ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm8[2,0],ymm15[5,4],ymm8[6,4] -; AVX-NEXT: vmovaps %ymm15, %ymm11 -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] -; AVX-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm2[0,1,0,1] -; AVX-NEXT: vmovaps 288(%rdi), %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] -; AVX-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm9[2,0],ymm12[5,4],ymm9[6,4] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] +; AVX-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; 
AVX-NEXT: # xmm13 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[2,1],ymm6[3,3],ymm10[6,5],ymm6[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm5[2,0],ymm15[5,4],ymm5[6,4] +; AVX-NEXT: vmovaps %ymm5, %ymm8 +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm5[0,1,0,1] +; AVX-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,1],ymm6[3,3],ymm7[6,5],ymm6[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = mem[0],xmm3[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX-NEXT: # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4] +; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm10[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX-NEXT: # ymm14 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] +; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,0],xmm6[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps $246, (%rsp), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX-NEXT: # ymm13 = ymm10[2,1],mem[3,3],ymm10[6,5],mem[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm10 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2,3] +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = mem[0],xmm15[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm4[1,0],ymm0[4,4],ymm4[5,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm4[1,0],ymm1[4,4],ymm4[5,4] ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,0],xmm7[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = mem[0],xmm7[1],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm14[2,0],xmm11[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm12[2,1],ymm9[3,3],ymm12[6,5],ymm9[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = mem[0],xmm3[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1,2],xmm2[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload ; AVX-NEXT: # ymm14 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,0],xmm3[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX-NEXT: vmovaps %ymm8, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,1],ymm9[3,3],ymm8[6,5],ymm9[7,7] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,0],xmm2[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,1],ymm8[3,3],ymm7[6,5],ymm8[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0],xmm8[1],xmm9[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; 
AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm2[2,0],ymm12[5,4],ymm2[6,4] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm14[1,0],ymm1[4,4],ymm14[5,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm3[0,0],ymm14[1,0],ymm3[4,4],ymm14[5,4] ; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,0],xmm2[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,0],xmm0[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm4[2,0],ymm1[5,4],ymm4[6,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm2[0,0],ymm10[7,4],ymm2[4,4] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm5[3] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm5[3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm14[2,0],ymm3[5,4],ymm14[6,4] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm2[0,0],ymm7[7,4],ymm2[4,4] ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = mem[0,1,0,1] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,0],xmm2[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm4[0,0],ymm1[7,4],ymm4[4,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,0],ymm4[0,0],ymm3[7,4],ymm4[4,4] ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = mem[2,3,2,3] ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -5372,90 +5369,91 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # ymm5 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] ; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm5[0,0],ymm1[7,4],ymm5[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,0],ymm5[0,0],ymm3[7,4],ymm5[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX-NEXT: # xmm7 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rsi) 
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm5, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 96(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 64(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 96(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 64(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps 
%ymm12, 96(%rax) ; AVX-NEXT: vmovaps %ymm13, 32(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 64(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rax) +; AVX-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rax) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm4, 32(%rax) ; AVX-NEXT: vmovaps %ymm2, (%rax) -; AVX-NEXT: vmovaps %ymm0, 96(%rax) -; AVX-NEXT: vmovaps %ymm3, 64(%rax) -; AVX-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX-NEXT: vmovaps %ymm1, 96(%rax) +; AVX-NEXT: vmovaps %ymm0, 64(%rax) +; AVX-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride7_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $1192, %rsp # imm = 0x4A8 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5470,16 +5468,16 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5488,9 +5486,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5501,16 +5498,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 672(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 672(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm10[6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm15 +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm9 ; AVX2-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5530,126 +5526,127 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm15, %ymm5 -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] ; AVX2-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] -; AVX2-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} 
ymm2 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm14[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm14 +; AVX2-NEXT: vmovdqa 704(%rdi), %xmm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm15 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm15 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] @@ -5657,319 +5654,325 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm15[0,2],mem[1,3],ymm15[4,6],mem[5,7] ; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],mem[3] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm10[1,3],ymm8[4,6],ymm10[5,7] ; AVX2-NEXT: vbroadcastss 656(%rdi), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; 
AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] -; AVX2-NEXT: vmovaps %ymm6, %ymm15 +; AVX2-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] +; AVX2-NEXT: vmovaps %ymm4, %ymm7 ; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm1 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] -; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,3,0,0] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [4,3,0,0] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-NEXT: vpermps %ymm12, %ymm11, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vmovaps %ymm9, %ymm12 +; AVX2-NEXT: vmovaps 512(%rdi), %xmm14 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm2 ; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] 
+; AVX2-NEXT: vmovaps %ymm10, %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-NEXT: vpermps %ymm10, %ymm11, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-NEXT: vmovaps %ymm15, %ymm12 +; AVX2-NEXT: vpermps %ymm15, %ymm4, %ymm3 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm8 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-NEXT: vpermps %ymm15, %ymm11, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vmovaps %ymm8, %ymm9 -; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] +; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vmovaps %ymm13, %ymm15 +; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastss 772(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-NEXT: vpermps %ymm14, 
%ymm11, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3] +; AVX2-NEXT: vbroadcastss 772(%rdi), %xmm9 +; AVX2-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 736(%rdi), %xmm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm9[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-NEXT: vpermps %ymm5, %ymm4, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovaps %ymm6, %ymm13 +; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 544(%rdi), %xmm3 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm14[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vbroadcastss 664(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vbroadcastss 664(%rdi), 
%ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovaps 320(%rdi), %xmm12 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm7[1],ymm15[2,3,4],ymm7[5],ymm15[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpermps 640(%rdi), %ymm11, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-NEXT: vbroadcastss 888(%rdi), %ymm9 +; 
AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vbroadcastss 584(%rdi), %xmm5 +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm4 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, 
%xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-NEXT: vpermps 416(%rdi), %ymm11, %ymm6 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, (%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-NEXT: vmovaps %ymm5, 32(%rax) -; AVX2-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-NEXT: vmovaps %ymm3, (%rax) -; AVX2-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-NEXT: addq $1192, %rsp # imm = 0x4A8 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX2-FP-LABEL: load_i32_stride7_vf32: -; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $1192, %rsp # imm = 0x4A8 -; 
AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 +; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm12[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm7[0],mem[1],xmm7[2,3] +; AVX2-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 +; 
AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, (%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps %ymm8, 96(%rax) +; AVX2-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-NEXT: vmovaps %ymm3, (%rax) +; AVX2-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX2-FP-LABEL: load_i32_stride7_vf32: +; AVX2-FP: # %bb.0: +; AVX2-FP-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FP-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5978,9 +5981,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5991,16 +5993,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm10[6],ymm12[7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 768(%rdi), 
%ymm15 +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm9 ; AVX2-FP-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -6020,126 +6021,127 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; 
AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-FP-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-FP-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm14 +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm15 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] @@ -6147,274 +6149,279 @@ define void 
@load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm15[0,2],mem[1,3],ymm15[4,6],mem[5,7] ; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],mem[3] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm10[1,3],ymm8[4,6],ymm10[5,7] ; AVX2-FP-NEXT: vbroadcastss 656(%rdi), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] -; AVX2-FP-NEXT: vmovaps %ymm6, %ymm15 +; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] +; AVX2-FP-NEXT: vmovaps %ymm4, %ymm7 ; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm6[1,3],ymm14[4,6],ymm6[5,7] -; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,3,0,0] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm0 = [4,3,0,0] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm11, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovaps %ymm9, %ymm12 +; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm4, %ymm2 ; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovaps %ymm10, %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, 
%ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpermps %ymm10, %ymm11, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FP-NEXT: vmovaps %ymm15, %ymm12 +; AVX2-FP-NEXT: vpermps %ymm15, %ymm4, %ymm3 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-FP-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpermps %ymm15, %ymm11, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vmovaps %ymm8, %ymm9 -; AVX2-FP-NEXT: vbroadcastss 436(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vmovaps %ymm13, %ymm15 +; AVX2-FP-NEXT: vbroadcastss 436(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vbroadcastss 772(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm11, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vbroadcastss 772(%rdi), %xmm9 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm9[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm4, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovaps %ymm6, %ymm13 +; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm14[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 664(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vbroadcastss 664(%rdi), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm7[1],ymm15[2,3,4],ymm7[5],ymm15[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm11, %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FP-NEXT: vbroadcastss 888(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 584(%rdi), %xmm5 +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded 
Reload +; AVX2-FP-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm11, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm11, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; 
AVX2-FP-NEXT: vpermps 416(%rdi), %ymm11, %ymm6 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, (%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 +; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm12[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm7[0],mem[1],xmm7[2,3] +; AVX2-FP-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FP-NEXT: addq $1192, %rsp # imm = 0x4A8 +; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-FP-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -6423,7 +6430,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm8 @@ -6434,7 +6441,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6],ymm14[7] ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -6464,8 +6471,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 @@ -6482,9 +6489,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm2[6],ymm9[7] +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -6517,23 +6524,23 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -6541,18 +6548,18 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] ; 
AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -6580,37 +6587,37 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 456(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX2-FCP-NEXT: vpbroadcastd 652(%rdi), %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 680(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = 
xmm13[0],xmm10[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] ; AVX2-FCP-NEXT: vpbroadcastd 876(%rdi), %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] @@ -6631,28 +6638,28 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FCP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,2],ymm9[1,3],ymm15[4,6],ymm9[5,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm14[1,3],ymm10[4,6],ymm14[5,7] ; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm12[1,3],ymm8[4,6],ymm12[5,7] +; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm12[1,3],ymm3[4,6],ymm12[5,7] ; AVX2-FCP-NEXT: vbroadcastss 656(%rdi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -6662,16 +6669,17 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] +; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm4[1,3],ymm5[4,6],ymm4[5,7] ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -6682,14 +6690,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,3,0,0] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm2 @@ -6707,11 +6715,10 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovaps %ymm15, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6723,8 +6730,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm11, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm15 ; AVX2-FCP-NEXT: vpbroadcastd 436(%rdi), %ymm8 @@ -6740,27 +6746,28 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm11, %ymm4 +; AVX2-FCP-NEXT: vmovaps %ymm6, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm12 ; AVX2-FCP-NEXT: vpbroadcastd 884(%rdi), %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm7[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] @@ -6769,8 +6776,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vbroadcastss 664(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FCP-NEXT: vbroadcastss 664(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] @@ -6779,24 +6786,24 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] +; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vbroadcastss 440(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FCP-NEXT: vbroadcastss 440(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vbroadcastss 888(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vbroadcastss 888(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 584(%rdi), %xmm3 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] @@ -6832,73 +6839,73 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-FCP-NEXT: vpbroadcastd 80(%rdi), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm9[3] +; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, 
%ymm7, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vbroadcastss 360(%rdi), %xmm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 360(%rdi), %xmm4 -; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm7[0],mem[1],xmm7[2,3] +; AVX2-FCP-NEXT: vpermd 416(%rdi), %ymm11, %ymm7 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 304(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3] ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, (%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -6933,33 +6940,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = 
[20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512-NEXT: movw $480, %di # imm = 0x1E0 @@ -6986,7 +6993,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7009,7 +7016,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7026,19 +7033,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512-NEXT: vbroadcasti64x4 
{{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7050,15 +7057,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7076,7 +7083,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7125,33 +7132,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512-FCP-NEXT: kmovw %edi, %k1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = 
[0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -7178,7 +7185,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7201,7 +7208,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7218,19 +7225,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512-FCP-NEXT: 
vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7242,15 +7249,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7268,7 +7275,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7317,33 +7324,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-NEXT: kmovw %edi, %k1 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; 
AVX512DQ-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-NEXT: movw $480, %di # imm = 0x1E0 @@ -7370,7 +7377,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7393,7 +7400,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7410,19 +7417,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7434,15 +7441,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqa64 
%zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7460,7 +7467,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7509,33 +7516,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -7562,7 +7569,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7585,7 +7592,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7602,19 +7609,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7626,15 +7633,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-FCP-NEXT: # zmm27 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7652,7 +7659,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7701,33 +7708,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512BW-NEXT: vpermt2d 
%zmm12, %zmm18, %zmm19 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 @@ -7754,7 +7761,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7777,7 +7784,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7794,19 +7801,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -7818,15 +7825,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; 
AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -7844,7 +7851,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -7893,33 +7900,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -7946,7 +7953,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -7969,7 +7976,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -7986,19 +7993,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8010,15 +8017,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 
@@ -8036,7 +8043,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -8085,33 +8092,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: movw $480, %di # imm = 0x1E0 @@ -8138,7 +8145,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 ; 
AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -8161,7 +8168,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8178,19 +8185,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8202,15 +8209,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8228,7 +8235,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; 
AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -8277,33 +8284,33 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movw $480, %di # imm = 0x1E0 @@ -8330,7 +8337,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-BW-FCP-NEXT: 
vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -8353,7 +8360,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [19,26,1,8,15,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8370,19 +8377,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 @@ -8394,15 +8401,15 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm29, %zmm28, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm30 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm30, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm31, %zmm29, %zmm29 @@ -8420,7 
+8427,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} @@ -9927,12 +9934,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i32_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $3176, %rsp # imm = 0xC68 +; AVX-NEXT: subq $3224, %rsp # imm = 0xC98 ; AVX-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 768(%rdi), %ymm11 +; AVX-NEXT: vmovaps 768(%rdi), %ymm13 ; AVX-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %ymm1 @@ -9944,15 +9951,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX-NEXT: vmovaps 384(%rdi), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX-NEXT: vmovaps 416(%rdi), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] @@ -9961,12 +9968,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 752(%rdi), %xmm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps 672(%rdi), %xmm15 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps 832(%rdi), %xmm1 @@ -9974,8 +9981,9 @@ 
define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX-NEXT: vmovaps 864(%rdi), %xmm13 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1] +; AVX-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9985,15 +9993,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps 1120(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps 1120(%rdi), %xmm14 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 1216(%rdi), %ymm9 +; AVX-NEXT: vmovaps 1216(%rdi), %ymm12 ; AVX-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10012,9 +10020,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps 1568(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps 1568(%rdi), %xmm7 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX-NEXT: vmovaps 1664(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10043,11 +10050,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 96(%rdi), %ymm14 +; AVX-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10066,15 +10073,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 544(%rdi), %ymm10 ; AVX-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10093,23 +10100,24 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps 896(%rdi), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps 896(%rdi), %xmm4 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 992(%rdi), %ymm5 +; AVX-NEXT: vmovaps 992(%rdi), %ymm8 ; AVX-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX-NEXT: vmovaps 1088(%rdi), %xmm8 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 1088(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10119,15 +10127,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vextractf128 $1, %ymm0, 
%xmm0 -; AVX-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps 1344(%rdi), %xmm5 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vmovaps 1440(%rdi), %ymm4 +; AVX-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1424(%rdi), %xmm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10140,82 +10148,84 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[2,2],ymm6[5,5],ymm1[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX-NEXT: vmovaps 352(%rdi), %ymm15 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7] +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 704(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm3, %xmm6 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[2,2],ymm13[5,5],ymm1[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2] +; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1184(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[2,2],ymm12[5,5],ymm1[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1248(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX-NEXT: vmovaps 1248(%rdi), %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0],ymm1[3,3],ymm13[4,4],ymm1[7,7] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1632(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 1600(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm7, %xmm12 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm1[2,2],ymm7[5,5],ymm1[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1696(%rdi), %ymm2 @@ -10228,39 +10238,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: 
vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[2,2],ymm2[5,5],ymm1[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7] +; AVX-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm0[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,0],ymm2[3,3],ymm3[4,4],ymm2[7,7] ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm0[2,2],ymm10[5,5],ymm0[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 576(%rdi), %ymm1 @@ -10273,645 +10282,679 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 928(%rdi), %xmm10 +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] ; AVX-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX-NEXT: vmovaps %xmm12, %xmm6 -; AVX-NEXT: vmovaps 
%xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm0[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7] -; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm11[2] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,0],mem[3,3] ; AVX-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX-NEXT: vmovaps 1376(%rdi), %xmm4 -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm5[3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1472(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0],ymm8[3,3],ymm1[4,4],ymm8[7,7] -; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = zero,xmm8[1,2],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX-NEXT: vmovaps 1472(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm0[0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] +; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = zero,xmm5[1,2],mem[0] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[2,1],ymm8[2,0],ymm0[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] +; 
AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[2,1],ymm8[2,0],ymm0[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm9[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,3,2,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[2,1],ymm8[2,0],ymm0[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = 
ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,1],ymm8[2,0],ymm7[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm14[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = 
ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm14[2,1],ymm12[2,0],ymm14[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %xmm3, %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX-NEXT: vmovaps %xmm4, %xmm6 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,1],ymm12[2,0],ymm4[6,5],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1],ymm8[2,0],ymm2[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1],ymm8[2,0],ymm2[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[3,1],mem[0,3],ymm8[7,5],mem[4,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1],ymm8[2,0],ymm2[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm12, 
%ymm0, %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 416(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm10[1,3],ymm0[4,5],ymm10[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm2[1,3],ymm0[4,5],ymm2[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm7[0,3],ymm8[7,5],ymm7[4,7] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,1],ymm8[2,0],ymm3[6,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0],ymm8[0,0],ymm5[5,4],ymm8[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1],ymm5[0,2],ymm8[7,5],ymm5[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm9[0,1,2],mem[3] +; 
AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm8[0,1],mem[1,3],ymm8[4,5],mem[5,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm8[2,0],ymm9[4,6],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0],ymm8[0,0],ymm5[5,4],ymm8[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1],ymm5[0,2],ymm8[7,5],ymm5[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm6[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 864(%rdi), %ymm6 +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm6[0,1],mem[1,3],ymm6[4,5],mem[5,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,2],ymm8[2,0],ymm6[4,6],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0],ymm8[0,0],ymm5[5,4],ymm8[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1],ymm5[0,2],ymm8[7,5],ymm5[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 1312(%rdi), %ymm4 +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,1],ymm0[1,3],ymm4[4,5],ymm0[5,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[3,1],ymm8[0,2],ymm2[7,5],ymm8[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 1312(%rdi), %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,2],ymm8[2,0],ymm0[4,6],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm8[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm12[5,6,7] -; 
AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm8[0,2],ymm1[7,5],ymm8[4,6] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm13[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm0[0,0],ymm4[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm14[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm11[1,3],ymm0[4,5],ymm11[5,7] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,1],ymm13[1,3],ymm0[4,5],ymm13[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm15[0,2],ymm8[2,0],ymm15[4,6],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm8[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm9[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vmovaps %ymm3, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[0,0],ymm7[5,4],ymm3[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm3[0,2],ymm0[7,5],ymm3[4,6] +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm12[3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 1536(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,2],ymm5[2,0],ymm2[4,6],ymm5[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX-NEXT: vmovaps %ymm3, %ymm15 -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[0,0],ymm1[5,4],ymm14[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm2[0,2],ymm14[7,5],ymm2[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte 
Folded Reload -; AVX-NEXT: # xmm4 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 1088(%rdi), %ymm11 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm10[1,3],ymm11[4,5],ymm10[5,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm3[0,2],ymm0[7,5],ymm3[4,6] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm10[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 1088(%rdi), %ymm14 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,1],ymm15[1,3],ymm14[4,5],ymm15[5,7] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm9[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX-NEXT: vmovaps 640(%rdi), %ymm8 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm5[1,3],ymm8[4,5],ymm5[5,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,2],ymm4[2,0],ymm3[4,6],ymm4[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm3[0,2],ymm0[7,5],ymm3[4,6] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm10[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 640(%rdi), %ymm7 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm2[1,3],ymm7[4,5],ymm2[5,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,2],ymm5[2,0],ymm4[4,6],ymm5[6,4] +; 
AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm6[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm3[0,2],ymm0[7,5],ymm3[4,6] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm8[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm7[2,0],ymm2[4,6],ymm7[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm5[2,0],ymm11[4,6],ymm5[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm4[0,0],ymm2[7,4],ymm4[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] -; AVX-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm3[0,0],ymm11[7,4],ymm3[4,4] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] +; AVX-NEXT: vmovaps 64(%rdi), %xmm12 +; AVX-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm11[0,1,0,1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm0[0,0],ymm9[7,4],ymm0[4,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm8[2,0],ymm13[5,4],ymm8[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] ; AVX-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] ; AVX-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm5[2,0],ymm8[5,4],ymm5[6,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,0],ymm0[0,0],ymm4[7,4],ymm0[4,4] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[2,0],ymm7[5,4],ymm2[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 512(%rdi), %xmm7 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm5[2,0],ymm3[5,4],ymm5[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX-NEXT: # xmm10 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vmovaps 960(%rdi), %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm0[0,0],ymm6[7,4],ymm0[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] +; AVX-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm2[0,1,0,1] +; AVX-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; 
AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX-NEXT: vmovaps 1184(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX-NEXT: vmovaps 1184(%rdi), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] +; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm1[0,1,0,1] ; AVX-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vperm2f128 
{{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm10[2,0],ymm0[6,4],ymm10[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm14[2,0],ymm0[6,4],ymm14[6,4] ; AVX-NEXT: vmovaps 1664(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX-NEXT: # ymm9 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[0],xmm2[1],mem[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm14[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vmovaps 1632(%rdi), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm12[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm6[0,0],ymm9[1,0],ymm6[4,4],ymm9[5,4] +; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,0],xmm4[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,1],ymm8[3,3],ymm13[6,5],ymm8[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm11[0,1,2],mem[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm12[1,0],ymm2[4,4],ymm12[5,4] -; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,1],ymm7[3,3],ymm13[6,5],ymm7[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm13[1,0],ymm10[4,4],ymm13[5,4] -; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[0,1,2],xmm6[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm12[1,0],ymm10[4,4],ymm12[5,4] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX-NEXT: vshufps {{.*#+}} xmm6 = 
xmm8[2,0],xmm6[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2],xmm7[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0],ymm13[1,0],ymm8[4,4],ymm13[5,4] ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,0],xmm4[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # 
ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,1],ymm5[3,3],ymm3[6,5],ymm5[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm11[0,1,2],mem[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2],xmm3[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0],ymm14[1,0],ymm7[4,4],ymm14[5,4] +; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,0],ymm0[2,0],ymm3[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0,1,2],xmm2[3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,1],ymm15[3,3],ymm5[6,5],ymm15[7,7] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,0],ymm0[2,0],ymm3[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[3,2] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,1],ymm15[3,3],ymm1[6,5],ymm15[7,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vshufps {{.*#+}} 
ymm0 = ymm2[1,0],ymm0[2,0],ymm2[5,4],ymm0[6,4] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm15[0,1,2],mem[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[3,2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm9[2,0],ymm6[5,4],ymm9[6,4] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm12[2,0],ymm10[5,4],ymm12[6,4] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vshufps 
{{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm13[2,0],ymm8[5,4],ymm13[6,4] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3] -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm14[2,0],ymm7[5,4],ymm14[6,4] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,0,1] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded 
Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[2,3,2,3] ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,0,1] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,0,1] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4] @@ -10921,212 +10964,175 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[0,1,0,1] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = 
xmm8[2,0],xmm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm5[3,0],ymm8[0,0],ymm5[7,4],ymm8[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX-NEXT: # ymm9 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] -; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm5[3,0],ymm9[0,0],ymm5[7,4],ymm9[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[0,1,0,1] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] -; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,0],xmm5[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4] -; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[2,3,2,3] -; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3] -; AVX-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3,0,1] -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm10[0,0],ymm6[7,4],ymm10[4,4] +; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = mem[2,3,2,3] +; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,0],ymm11[4,5],ymm8[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm15[3] +; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX-NEXT: # ymm11 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,0],xmm8[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm3[3,0],ymm11[0,0],ymm3[7,4],ymm11[4,4] ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX-NEXT: # xmm12 = mem[2,3,2,3] ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,0],ymm12[4,5],ymm10[6,4] -; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm10, 160(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: 
vmovaps %ymm6, 192(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 192(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 160(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 96(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 32(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 192(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 160(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 96(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 32(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%r9) -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 224(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 192(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 160(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 128(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 96(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 64(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, 32(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, (%rax) +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,0],ymm12[4,5],ymm11[6,4] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm11, 160(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 160(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%r9) +; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 160(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps %ymm9, 224(%rax) -; AVX-NEXT: vmovaps %ymm5, 192(%rax) -; AVX-NEXT: vmovaps %ymm4, 160(%rax) -; AVX-NEXT: vmovaps %ymm8, 128(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 224(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 192(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 160(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 128(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, 96(%rax) -; AVX-NEXT: vmovaps %ymm0, 64(%rax) -; AVX-NEXT: vmovaps %ymm1, 32(%rax) -; AVX-NEXT: vmovaps %ymm2, (%rax) -; AVX-NEXT: addq $3176, %rsp # imm = 0xC68 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 64(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vmovaps %ymm8, 224(%rax) +; AVX-NEXT: vmovaps %ymm4, 192(%rax) +; AVX-NEXT: vmovaps %ymm1, 160(%rax) +; AVX-NEXT: vmovaps %ymm2, 128(%rax) +; AVX-NEXT: vmovaps %ymm0, 96(%rax) +; AVX-NEXT: vmovaps %ymm9, 64(%rax) +; AVX-NEXT: vmovaps %ymm10, 32(%rax) +; AVX-NEXT: vmovaps %ymm6, (%rax) +; AVX-NEXT: addq $3224, %rsp # imm = 0xC98 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride7_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm8 ; AVX2-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm12 -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -11138,28 +11144,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 752(%rdi), %ymm2 +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm12, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -11170,11 +11174,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1600(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 1600(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1568(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7] ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11190,34 +11194,34 @@ define void 
@load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -11230,11 +11234,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-NEXT: vmovdqa 928(%rdi), %ymm15 +; AVX2-NEXT: vmovdqa 896(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm15[6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11250,16 +11254,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1376(%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 1344(%rdi), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1376(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 1344(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm9 ; AVX2-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -11271,46 +11275,49 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm2 @@ -11320,12 +11327,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] +; AVX2-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11340,30 +11347,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd 
%ymm2, %ymm0, %ymm2 @@ -11371,27 +11378,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm2 @@ -11401,8 +11406,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -11410,109 +11415,110 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX2-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm14[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill ; AVX2-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-NEXT: vpbroadcastd 876(%rdi), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vpbroadcastd 1324(%rdi), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-NEXT: vpbroadcastd 1772(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-NEXT: vpbroadcastd 204(%rdi), %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovdqa 528(%rdi), %xmm6 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpbroadcastd 456(%rdi), %xmm0 +; 
AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -11521,118 +11527,115 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa 928(%rdi), %xmm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] +; AVX2-NEXT: vmovdqa 928(%rdi), %xmm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastd 1548(%rdi), %ymm15 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} 
ymm13 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] -; AVX2-NEXT: vmovaps %ymm4, %ymm12 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] ; AVX2-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,2],ymm14[1,3],ymm4[4,6],ymm14[5,7] ; AVX2-NEXT: vbroadcastss 880(%rdi), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = 
ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,2],ymm13[1,3],ymm10[4,6],ymm13[5,7] ; AVX2-NEXT: vbroadcastss 1328(%rdi), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7] -; AVX2-NEXT: vbroadcastss 1776(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 1776(%rdi), %ymm3 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm9[1,3],ymm4[4,6],ymm9[5,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; 
AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] -; AVX2-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] -; AVX2-NEXT: vmovaps %ymm7, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],mem[3] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] ; AVX2-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11640,9 +11643,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm7[1,3],ymm5[4,6],ymm7[5,7] ; AVX2-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -11655,151 +11658,150 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm1 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] -; AVX2-NEXT: vbroadcastss 208(%rdi), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-NEXT: vmovsd {{.*#+}} xmm5 = [4,3,0,0] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] +; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [4,3,0,0] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vpermps %ymm3, %ymm12, %ymm2 +; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] -; AVX2-NEXT: vpermps %ymm3, %ymm15, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vbroadcastss 212(%rdi), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm2 +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm6, %ymm2 +; AVX2-NEXT: vbroadcastss 324(%rdi), %xmm3 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpermps %ymm12, %ymm15, %ymm2 -; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm3 -; 
AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vbroadcastss 548(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-NEXT: vpermps %ymm4, %ymm15, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastss 436(%rdi), %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vpermps %ymm2, %ymm6, %ymm3 +; AVX2-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-NEXT: vpermps %ymm5, %ymm12, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vbroadcastss 772(%rdi), %xmm4 +; AVX2-NEXT: vpermps %ymm3, %ymm6, %ymm5 ; AVX2-NEXT: vmovaps 736(%rdi), %xmm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-NEXT: vpermps %ymm14, %ymm15, %ymm4 -; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] ; AVX2-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpermps %ymm4, %ymm6, %ymm5 ; AVX2-NEXT: vbroadcastss 996(%rdi), %xmm7 ; AVX2-NEXT: vmovaps 960(%rdi), %xmm4 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX2-NEXT: vpermps %ymm8, %ymm15, %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-NEXT: vpermps %ymm8, %ymm12, %ymm7 ; AVX2-NEXT: vbroadcastss 1108(%rdi), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vbroadcastss 1220(%rdi), %xmm7 -; AVX2-NEXT: vmovaps 1184(%rdi), %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpermps %ymm10, %ymm15, %ymm7 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm7 -; AVX2-NEXT: vbroadcastss 1444(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 1408(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm7 +; AVX2-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; AVX2-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vmovaps %ymm9, %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vbroadcastss 1556(%rdi), %ymm9 +; AVX2-NEXT: vpermps %ymm10, %ymm12, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vbroadcastss 1332(%rdi), %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpermps %ymm7, %ymm5, %ymm7 -; AVX2-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 1632(%rdi), %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-NEXT: vbroadcastss 1444(%rdi), %xmm8 +; AVX2-NEXT: vmovaps 1408(%rdi), %xmm10 +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-NEXT: vpermps %ymm13, %ymm15, %ymm8 -; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm13, %ymm12, %ymm8 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vbroadcastss 1556(%rdi), %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm8[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vpermps %ymm15, %ymm12, %ymm7 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 1780(%rdi), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = 
ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-NEXT: vbroadcastss 216(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] @@ -11810,8 +11812,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 544(%rdi), %xmm8 @@ -11822,30 +11824,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vblendps $34, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 
-; AVX2-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vbroadcastss 888(%rdi), %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 992(%rdi), %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] @@ -11853,175 +11856,176 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-NEXT: vbroadcastss 1112(%rdi), %ymm6 +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm5[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-NEXT: vbroadcastss 1336(%rdi), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vbroadcastss 
1336(%rdi), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovaps 1440(%rdi), %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm10[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] -; AVX2-NEXT: vbroadcastss 1560(%rdi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovaps 1664(%rdi), %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermps 192(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm0 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermps 416(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3] -; AVX2-NEXT: 
vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 1664(%rdi), %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm5[0,1,2],xmm14[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vbroadcastss 136(%rdi), %xmm14 +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-NEXT: vpermps 192(%rdi), %ymm12, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vbroadcastss 80(%rdi), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vbroadcastss 360(%rdi), %xmm14 +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-NEXT: vpermps 416(%rdi), %ymm12, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vbroadcastss 304(%rdi), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vbroadcastss 584(%rdi), %xmm0 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermps 640(%rdi), 
%ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-NEXT: vpermps 640(%rdi), %ymm12, %ymm15 +; AVX2-NEXT: vbroadcastss 528(%rdi), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-NEXT: vbroadcastss 808(%rdi), %xmm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm15[0],mem[1],xmm15[2,3] +; AVX2-NEXT: vpermps 864(%rdi), %ymm12, %ymm1 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpermps 864(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vbroadcastss 752(%rdi), %ymm1 +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpermps 1088(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 976(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-NEXT: 
vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-NEXT: vpermps 1088(%rdi), %ymm12, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpermps 1312(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 1200(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpermps 1312(%rdi), %ymm12, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 1200(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpermps 1536(%rdi), %ymm15, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 1424(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vpermps 1536(%rdi), %ymm12, %ymm9 +; AVX2-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded 
Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-NEXT: vbroadcastss 1704(%rdi), %xmm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastss 1704(%rdi), %xmm4 -; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-NEXT: vpermps 1760(%rdi), %ymm15, %ymm5 +; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-NEXT: vpermps 1760(%rdi), %ymm12, %ymm9 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastss 1648(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 192(%rsi) @@ -12097,20 +12101,21 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm5, 128(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 64(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm12, 224(%rax) +; AVX2-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-NEXT: vmovaps %ymm10, 192(%rax) ; AVX2-NEXT: vmovaps %ymm6, 160(%rax) -; AVX2-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-NEXT: vmovaps %ymm5, 128(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, 32(%rax) @@ -12123,35 +12128,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) 
; AVX2-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-NEXT: vmovaps %ymm11, (%rax) -; AVX2-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX2-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-NEXT: vmovaps %ymm7, (%rax) +; AVX2-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride7_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -12163,28 +12167,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 752(%rdi), %ymm2 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -12195,11 +12197,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7] ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12215,34 +12217,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), 
%ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -12255,11 +12257,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm15[6],ymm12[7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12275,16 +12277,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm9 ; AVX2-FP-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -12296,46 +12298,49 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6],ymm8[7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm2 @@ -12345,12 +12350,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12365,30 +12370,30 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -12396,27 +12401,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm2 ; AVX2-FP-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,2,2,2] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm2 @@ -12426,8 +12429,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -12435,109 +12438,110 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm14[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; 
AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-FP-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FP-NEXT: vpbroadcastd 876(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FP-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vpbroadcastd 1324(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FP-NEXT: vpbroadcastd 1772(%rdi), %ymm6 
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vpbroadcastd 1772(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-FP-NEXT: vpbroadcastd 204(%rdi), %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpbroadcastd 456(%rdi), %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FP-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -12546,118 +12550,115 @@ define void @load_i32_stride7_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] +; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FP-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 1548(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FP-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; 
AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] -; AVX2-FP-NEXT: vmovaps %ymm4, %ymm12 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] ; AVX2-FP-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,2],ymm14[1,3],ymm4[4,6],ymm14[5,7] ; AVX2-FP-NEXT: vbroadcastss 880(%rdi), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,2],ymm13[1,3],ymm10[4,6],ymm13[5,7] ; AVX2-FP-NEXT: vbroadcastss 1328(%rdi), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm13[0,2],mem[1,3],ymm13[4,6],mem[5,7] -; AVX2-FP-NEXT: vbroadcastss 1776(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 1776(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm9[1,3],ymm4[4,6],ymm9[5,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[3,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] -; AVX2-FP-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-FP-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] -; AVX2-FP-NEXT: vmovaps %ymm7, %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],mem[3] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] ; AVX2-FP-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -12665,9 +12666,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm7[1,3],ymm4[4,6],ymm7[5,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm7[1,3],ymm5[4,6],ymm7[5,7] ; AVX2-FP-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -12680,151 
+12681,150 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] -; AVX2-FP-NEXT: vbroadcastss 208(%rdi), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm5 = [4,3,0,0] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [4,3,0,0] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [0,7,0,7,0,7,0,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpermps %ymm3, %ymm12, %ymm2 +; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm1, %ymm5, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] -; AVX2-FP-NEXT: vpermps %ymm3, %ymm15, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vbroadcastss 212(%rdi), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm2 +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm1, %ymm6, %ymm2 +; AVX2-FP-NEXT: vbroadcastss 324(%rdi), %xmm3 ; AVX2-FP-NEXT: 
vmovaps 288(%rdi), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpermps %ymm12, %ymm15, %ymm2 -; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastss 436(%rdi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vbroadcastss 548(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpermps %ymm4, %ymm15, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastss 436(%rdi), %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm2, %ymm6, %ymm3 +; AVX2-FP-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm12, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 772(%rdi), %xmm4 +; AVX2-FP-NEXT: vpermps %ymm3, %ymm6, %ymm5 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-FP-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpermps %ymm14, %ymm15, %ymm4 -; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FP-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm4, %ymm6, %ymm5 ; AVX2-FP-NEXT: vbroadcastss 996(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %xmm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm15, %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpermps %ymm8, %ymm12, %ymm7 ; AVX2-FP-NEXT: vbroadcastss 1108(%rdi), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vbroadcastss 1220(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpermps %ymm10, %ymm15, %ymm7 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm5, %ymm7 -; AVX2-FP-NEXT: vbroadcastss 1444(%rdi), %xmm8 -; 
AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm7 +; AVX2-FP-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: vmovaps %ymm9, %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vbroadcastss 1556(%rdi), %ymm9 +; AVX2-FP-NEXT: vpermps %ymm10, %ymm12, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vbroadcastss 1332(%rdi), %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm5, %ymm7 -; AVX2-FP-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FP-NEXT: vbroadcastss 1444(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vpermps %ymm13, %ymm15, %ymm8 -; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm13, %ymm12, %ymm8 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vbroadcastss 1556(%rdi), %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-FP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm8[3] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpermps %ymm15, %ymm12, %ymm7 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 1780(%rdi), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FP-NEXT: vbroadcastss 216(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] @@ -12835,8 +12835,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm8 @@ -12847,30 +12847,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps $34, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = 
ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FP-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vbroadcastss 888(%rdi), %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] @@ -12878,175 +12879,176 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FP-NEXT: vbroadcastss 1112(%rdi), %ymm6 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm14[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm5[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; 
AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 1336(%rdi), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vbroadcastss 1336(%rdi), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1440(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm10[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0,3,3,5,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 1560(%rdi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 1664(%rdi), %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm5[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; 
AVX2-FP-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermps 416(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1664(%rdi), %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm5[0,1,2],xmm14[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-FP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 136(%rdi), %xmm14 +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpermps 192(%rdi), %ymm12, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vbroadcastss 80(%rdi), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = 
mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 360(%rdi), %xmm14 +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpermps 416(%rdi), %ymm12, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vbroadcastss 304(%rdi), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 584(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpermps 640(%rdi), %ymm12, %ymm15 +; AVX2-FP-NEXT: vbroadcastss 528(%rdi), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vbroadcastss 808(%rdi), %xmm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm15[0],mem[1],xmm15[2,3] +; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm12, %ymm1 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermps 864(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = 
mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vbroadcastss 752(%rdi), %ymm1 +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpermps 1088(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 976(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpermps 1088(%rdi), %ymm12, %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpermps 1312(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 1200(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpermps 1312(%rdi), %ymm12, %ymm9 +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 1200(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpermps 1536(%rdi), %ymm15, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 1424(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpermps 1536(%rdi), %ymm12, %ymm9 +; AVX2-FP-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vbroadcastss 1704(%rdi), %xmm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 1704(%rdi), %xmm4 -; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FP-NEXT: vpermps 1760(%rdi), %ymm15, %ymm5 +; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FP-NEXT: vpermps 1760(%rdi), %ymm12, %ymm9 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastss 1648(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = 
xmm9[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rsi) @@ -13122,20 +13124,21 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm5, 128(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm12, 224(%rax) +; AVX2-FP-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-FP-NEXT: vmovaps %ymm10, 192(%rax) ; AVX2-FP-NEXT: vmovaps %ymm6, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax) @@ -13148,20 +13151,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, (%rax) -; AVX2-FP-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX2-FP-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FP-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride7_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $2648, %rsp # imm = 0xA58 +; AVX2-FCP-NEXT: subq $2664, %rsp # imm = 0xA68 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 @@ -13169,7 +13172,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm10[6],ymm11[7] ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13205,7 +13208,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] @@ -13219,11 +13223,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6],ymm5[7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13243,11 +13247,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -13265,17 +13269,17 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13288,7 +13292,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -13330,12 +13334,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,7,6,5,6,5,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13348,10 +13352,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] @@ -13369,10 +13374,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6],ymm7[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -13391,10 +13396,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13403,14 +13408,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = 
ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] @@ -13459,20 +13464,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm15[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 @@ -13480,49 +13485,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpbroadcastd 428(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-FCP-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpbroadcastd 1128(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX2-FCP-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX2-FCP-NEXT: vpbroadcastd 1324(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1648(%rdi), %xmm0 @@ -13530,8 +13534,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -13541,28 +13545,28 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vpbroadcastd 204(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 456(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklqdq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FCP-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -13571,12 +13575,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload @@ -13586,69 +13590,68 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1424(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-FCP-NEXT: vpbroadcastd 1548(%rdi), %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FCP-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] -; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm5[1,3],ymm12[4,6],ymm5[5,7] +; AVX2-FCP-NEXT: vbroadcastss 432(%rdi), %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm12[1,3],ymm11[4,6],ymm12[5,7] ; AVX2-FCP-NEXT: vbroadcastss 880(%rdi), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm8[1,3],ymm14[4,6],ymm8[5,7] -; AVX2-FCP-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-FCP-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 1328(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm10[1,3],ymm14[4,6],ymm10[5,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] @@ -13656,26 +13659,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm1[1,3],ymm11[4,6],ymm1[5,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] +; AVX2-FCP-NEXT: vmovaps %ymm6, %ymm10 ; AVX2-FCP-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} 
ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm8[1,3],ymm11[4,6],ymm8[5,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] ; AVX2-FCP-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -13688,11 +13692,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vbroadcastss 656(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] -; AVX2-FCP-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm7[1,3],ymm5[4,6],ymm7[5,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13704,144 +13708,146 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm12[1,3],ymm4[4,6],ymm12[5,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm11[1,3],ymm4[4,6],ymm11[5,7] ; AVX2-FCP-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovsd 
{{.*#+}} xmm0 = [4,3,0,0] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm15, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vbroadcastss 212(%rdi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,3,0,0] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [0,7,0,7,0,7,0,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vbroadcastss 212(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vbroadcastss 324(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastss 436(%rdi), %ymm4 +; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vbroadcastss 548(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm15, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vbroadcastss 772(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpermps %ymm13, %ymm15, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 996(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm15, %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm15, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vbroadcastss 1220(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm15, %ymm7 -; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vbroadcastss 1332(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vbroadcastss 1444(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %xmm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} 
xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpermps %ymm13, %ymm15, %ymm7 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm15, %ymm7 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vbroadcastss 1556(%rdi), %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vbroadcastss 1668(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastss 1780(%rdi), %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm12, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vbroadcastss 216(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; 
AVX2-FCP-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] @@ -13851,8 +13857,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vbroadcastss 440(%rdi), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] @@ -13866,13 +13872,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps $34, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vbroadcastss 664(%rdi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm4[3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] @@ -13880,277 +13886,279 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vbroadcastss 888(%rdi), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm5[3] ; AVX2-FCP-NEXT: 
vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermps %ymm4, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vbroadcastss 1112(%rdi), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm10[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm11[3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FCP-NEXT: vbroadcastss 1336(%rdi), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FCP-NEXT: vbroadcastss 1336(%rdi), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm14[3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm12, %ymm14 -; AVX2-FCP-NEXT: vbroadcastss 1560(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 1560(%rdi), %ymm10 
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %xmm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermps %ymm10, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 136(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps 192(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 80(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpermps 192(%rdi), %ymm15, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vbroadcastss 80(%rdi), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 360(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = 
xmm0[0],mem[1],xmm0[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps 416(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 304(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3] +; AVX2-FCP-NEXT: vpermps 416(%rdi), %ymm15, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vbroadcastss 304(%rdi), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm8[3] ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 584(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpermps 640(%rdi), %ymm15, %ymm10 +; AVX2-FCP-NEXT: vbroadcastss 528(%rdi), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastss 808(%rdi), %xmm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm10[0],mem[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpermps 864(%rdi), %ymm15, %ymm1 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps 640(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 528(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps 864(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 752(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vbroadcastss 752(%rdi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 1032(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps 1088(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 976(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpermps 1088(%rdi), %ymm15, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vbroadcastss 976(%rdi), %ymm1 +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 1256(%rdi), %xmm1 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps 1312(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 1200(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FCP-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpermps 1312(%rdi), %ymm15, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vbroadcastss 1200(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 1480(%rdi), %xmm2 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpermps 1536(%rdi), %ymm15, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 1424(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpermps 1536(%rdi), %ymm15, %ymm6 +; AVX2-FCP-NEXT: vbroadcastss 1424(%rdi), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vbroadcastss 1704(%rdi), %xmm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 1704(%rdi), %xmm4 -; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpermps 1760(%rdi), %ymm15, %ymm11 +; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm6[0],mem[1],xmm6[2,3] +; AVX2-FCP-NEXT: vpermps 1760(%rdi), %ymm15, %ymm6 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 1648(%rdi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3] -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 
32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 
# 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm9, (%r9) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vbroadcastss 1648(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm12, 224(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rax) ; AVX2-FCP-NEXT: 
vmovaps %ymm5, 160(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax) @@ -14165,267 +14173,268 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm13, 32(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rax) -; AVX2-FCP-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX2-FCP-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i32_stride7_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 
{{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm18 +; 
AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = 
[26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,7,14,21,28,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -14463,8 +14472,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -14484,7 +14493,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512-NEXT: movw $480, %ax # imm = 0x1E0 @@ -14528,9 +14537,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14548,13 +14557,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm20, 
%zmm9 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -14563,10 +14572,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -14597,7 +14606,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm20, (%r9) @@ -14620,260 +14629,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-LABEL: load_i32_stride7_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), 
%zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; 
AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} 
ymm3 = [1,8,15,22,29,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -14911,8 +14921,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -14932,7 +14942,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -14976,9 +14986,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14996,13 +15006,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -15011,10 +15021,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -15045,7 +15055,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r9) @@ -15068,260 +15078,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-LABEL: load_i32_stride7_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512DQ-NEXT: 
vmovdqa64 1152(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-NEXT: 
vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512DQ-NEXT: 
vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 +; 
AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -15359,8 +15370,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -15380,7 +15391,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-NEXT: movw $480, %ax # imm = 0x1E0 @@ -15424,9 +15435,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15444,13 +15455,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 
%zmm26, %zmm20 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -15459,10 +15470,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -15493,7 +15504,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%r9) @@ -15516,260 +15527,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-LABEL: load_i32_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 ; 
AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, 
%zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 
= [1,8,15,22,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -15807,8 +15819,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -15828,7 +15840,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -15872,9 +15884,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15892,13 +15904,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -15907,10 +15919,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -15941,7 +15953,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r9) @@ -15964,260 +15976,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; 
AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; 
AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; 
AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} 
ymm7 = [19,26,1,8,15,0,0,0] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -16255,8 +16268,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -16276,7 +16289,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 @@ -16320,9 +16333,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16340,13 +16353,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -16355,10 +16368,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -16389,7 +16402,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) @@ -16412,260 +16425,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-LABEL: load_i32_stride7_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 
320(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-FCP-NEXT: # 
zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 
448(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -16703,8 +16717,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -16724,7 +16738,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -16768,9 +16782,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16788,13 +16802,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -16803,10 +16817,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -16837,7 +16851,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) @@ -16860,260 +16874,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-LABEL: load_i32_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: 
subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, 
%zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d 
%zmm18, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d 
%zmm2, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -17151,8 +17166,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -17172,7 +17187,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: 
vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: movw $480, %ax # imm = 0x1E0 @@ -17216,9 +17231,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17236,13 +17251,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -17251,10 +17266,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -17285,7 +17300,7 @@ define void @load_i32_stride7_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%r9) @@ -17308,260 +17323,261 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-LABEL: load_i32_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm28, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; 
AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm28, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm13, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm11, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm12, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm12, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
%zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm12, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm12, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm29, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm27, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm29, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm2, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,7,14,21,28,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,8,15,22,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [18,25,0,7,14,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [19,26,1,8,15,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [4,11,18,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [5,12,19,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [6,13,20,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 @@ -17599,8 +17615,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 @@ -17620,7 +17636,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $480, %ax # imm = 0x1E0 @@ -17664,9 +17680,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17684,13 +17700,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm18, %zmm29, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload @@ -17699,10 +17715,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -17733,7 +17749,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 13410fb5cc4b8..4c96543ff03fd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -219,15 +219,15 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} 
xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -284,7 +284,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1 @@ -349,15 +349,15 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] -; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6 ; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -414,7 +414,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1 @@ -740,23 +740,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; 
AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-NEXT: vmovdqa %xmm3, (%rdx) @@ -774,23 +774,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -808,23 +808,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 
= [1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx) @@ -842,23 +842,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -876,23 +876,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = 
[2,10,18,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -910,23 +910,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -944,23 +944,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) @@ -978,23 +978,23 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) @@ -1234,18 +1234,18 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i32_stride8_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: 
vmovaps 224(%rdi), %xmm8 ; AVX2-NEXT: vbroadcastss %xmm8, %xmm5 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] @@ -1259,33 +1259,34 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 @@ -1293,70 +1294,73 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-NEXT: vunpcklps 
{{.*#+}} ymm9 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm12 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm3 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm1[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = 
ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm10 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-NEXT: vmovaps %ymm8, (%r8) ; AVX2-NEXT: vmovaps %ymm9, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm12, (%rax) +; AVX2-NEXT: vmovaps %ymm0, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm1, (%rax) +; AVX2-NEXT: vmovaps %ymm3, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: vmovaps %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i32_stride8_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm5 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] @@ -1370,33 +1374,34 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, 
%ymm0, %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 @@ -1404,70 +1409,73 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FP-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-FP-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm12 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm3 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm10 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, (%r8) ; AVX2-FP-NEXT: vmovaps %ymm9, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 
(%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i32_stride8_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm5 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm6 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] @@ -1481,33 +1489,34 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = 
xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 @@ -1515,53 +1524,56 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm12 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm3 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm10 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8) ; AVX2-FCP-NEXT: vmovaps %ymm9, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1574,44 +1586,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = 
[0,0,0,0,1,9,17,25] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm4, (%rsi) @@ -1634,44 +1646,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, 
%zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1694,44 +1706,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] 
; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi) @@ -1754,44 +1766,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1814,44 +1826,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1874,44 +1886,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -1934,44 +1946,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd 
{{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) @@ -1994,44 +2006,44 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) @@ -2612,8 +2624,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX2-NEXT: vbroadcastss %xmm14, %xmm3 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 @@ -2623,7 +2635,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2636,9 +2648,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX2-NEXT: vmovaps %xmm8, %xmm6 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -2661,7 +2673,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovaps %xmm8, %xmm7 ; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2676,11 +2688,11 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte 
Folded Reload @@ -2695,29 +2707,29 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[4],ymm7[4],ymm15[5],ymm7[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -2732,51 +2744,51 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm6[2,3] ; AVX2-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm11 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[6],ymm7[6],ymm15[7],ymm7[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = 
ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2785,56 +2797,56 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vunpckhps (%rsp), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm4 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 
32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%r9) +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm2 +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm11, 32(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm4, (%rax) +; AVX2-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: vmovaps %ymm1, (%rax) +; AVX2-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-NEXT: vmovaps %ymm0, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-NEXT: vmovaps %ymm2, (%rax) +; AVX2-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-NEXT: vmovaps %ymm1, (%rax) ; AVX2-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2886,8 +2898,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-FP-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm3 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10 @@ -2897,7 +2909,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2910,9 +2922,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX2-FP-NEXT: vmovaps %xmm8, %xmm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -2935,7 +2947,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovaps %xmm8, %xmm7 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2950,11 +2962,11 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -2969,29 +2981,29 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[4],ymm7[4],ymm15[5],ymm7[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -3006,51 +3018,51 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm6[2,3] ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[6],ymm7[6],ymm15[7],ymm7[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -3059,56 +3071,56 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm4 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, 
(%r9) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm2 +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -3160,8 +3172,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX2-FCP-NEXT: vbroadcastss %xmm14, %xmm3 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10 @@ -3171,7 +3183,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3184,9 +3196,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] ; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3209,7 +3221,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm7 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3224,11 +3236,11 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -3243,29 +3255,29 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovaps (%rdi), 
%ymm12 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[4],ymm7[4],ymm15[5],ymm7[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -3280,51 +3292,51 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm6[2,3] ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[6],ymm7[6],ymm15[7],ymm7[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps 
{{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -3333,62 +3345,62 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm4 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9) -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax) -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8 -; AVX2-FCP-NEXT: vzeroupper -; AVX2-FCP-NEXT: retq -; -; AVX512-LABEL: load_i32_stride8_vf16: -; AVX512: # %bb.0: +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm2 +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX2-FCP-NEXT: vzeroupper +; AVX2-FCP-NEXT: retq +; +; AVX512-LABEL: load_i32_stride8_vf16: +; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %r11 @@ -3413,9 +3425,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -3425,9 +3437,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -3437,9 +3449,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -3449,9 +3461,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -3461,9 +3473,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -3473,9 +3485,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; 
AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -3485,9 +3497,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -3532,9 +3544,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -3544,9 +3556,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -3556,9 +3568,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -3568,9 +3580,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = 
[4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -3580,9 +3592,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -3592,9 +3604,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -3604,9 +3616,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -3651,9 +3663,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -3663,9 +3675,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -3675,9 +3687,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -3687,9 +3699,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -3699,9 +3711,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -3711,9 +3723,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -3723,9 +3735,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-NEXT: 
vmovdqa64 %zmm6, %zmm3 {%k1} @@ -3770,9 +3782,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -3782,9 +3794,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -3794,9 +3806,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -3806,9 +3818,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -3818,9 +3830,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -3830,9 +3842,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -3842,9 +3854,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -3889,9 +3901,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -3901,9 +3913,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -3913,9 +3925,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -3925,9 +3937,9 @@ define void @load_i32_stride8_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -3937,9 +3949,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -3949,9 +3961,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -3961,9 +3973,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -4008,9 +4020,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -4020,9 +4032,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, 
%zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -4032,9 +4044,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -4044,9 +4056,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -4056,9 +4068,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -4068,9 +4080,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -4080,9 +4092,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: 
vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -4127,9 +4139,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -4139,9 +4151,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -4151,9 +4163,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -4163,9 +4175,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -4175,9 +4187,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, 
%zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -4187,9 +4199,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -4199,9 +4211,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -4246,9 +4258,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 @@ -4258,9 +4270,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 @@ -4270,9 +4282,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, 
%zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 @@ -4282,9 +4294,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 @@ -4294,9 +4306,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 @@ -4306,9 +4318,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 @@ -4318,9 +4330,9 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} @@ -5580,9 +5592,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: vmovaps 736(%rdi), %xmm1 @@ -5682,9 +5694,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5829,18 +5841,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] +; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5855,21 +5868,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 928(%rdi), %ymm13 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[4],ymm13[4],ymm1[5],ymm13[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 @@ -5882,251 +5893,254 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: 
vmovaps 512(%rdi), %ymm14 +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] ; AVX2-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 672(%rdi), %ymm11 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm1 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[6],ymm3[6],ymm15[7],ymm3[7] ; AVX2-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm0 +; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm2 -; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] 
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm1 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] +; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm5 +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm5 +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm1 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm1 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm1 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm0 +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; 
AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm0 +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm6 +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%rsi) +; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rax) -; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, (%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm6, 32(%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-NEXT: vmovaps %ymm8, (%rax) +; AVX2-NEXT: vmovaps %ymm3, (%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: vmovaps %ymm4, (%rax) +; 
AVX2-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-NEXT: vmovaps %ymm5, (%rax) ; AVX2-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -6201,9 +6215,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm1 @@ -6303,9 +6317,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6450,18 +6464,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] +; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6476,21 
+6491,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[4],ymm13[4],ymm1[5],ymm13[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 @@ -6503,251 +6516,254 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm14 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm1 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[6],ymm3[6],ymm15[7],ymm3[7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm0 +; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = 
ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm3 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm2 -; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm3 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = 
ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm5 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm5 +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 988(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 988(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm6 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 
+; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, (%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -6822,9 +6838,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm1 @@ -6924,9 +6940,9 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7071,18 +7087,19 @@ define void @load_i32_stride8_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] +; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -7097,21 +7114,19 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[4],ymm13[4],ymm1[5],ymm13[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 @@ -7124,251 +7139,254 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm14 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm1 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, 
%xmm1 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[6],ymm3[6],ymm15[7],ymm3[7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm0 +; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm3 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm2 -; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm3 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm5 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm5 +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm6 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: 
vmovaps %ymm6, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm8, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; 
AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -7487,29 +7505,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -7706,29 +7724,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -7925,29 +7943,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 
%zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8144,29 +8162,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8363,29 +8381,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8582,29 +8600,29 @@ define void @load_i32_stride8_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8801,29 +8819,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d 
%zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -9020,29 +9038,29 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, 
%zmm4, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -11617,7 +11635,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 352(%rdi), %xmm9 ; AVX2-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -11645,7 +11663,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 864(%rdi), %xmm12 ; AVX2-NEXT: vbroadcastss %xmm12, %xmm1 ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -11850,7 +11868,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] ; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps %xmm13, %xmm9 @@ -11871,7 +11889,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload 
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -12016,7 +12034,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -12036,7 +12054,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -12164,7 +12182,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -12179,7 +12197,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] @@ -12273,11 +12291,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12324,8 +12342,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 544(%rdi), %ymm0 @@ -12346,11 +12364,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 672(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -12371,15 +12389,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 960(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 928(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm0 @@ -12396,160 +12414,173 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1216(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1184(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 1152(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1280(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1472(%rdi), %ymm11 +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 1408(%rdi), %ymm10 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1344(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-NEXT: vmovaps 1568(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX2-NEXT: vmovaps 1504(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 1536(%rdi), %ymm10 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[4],ymm0[4],ymm10[5],ymm0[5] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX2-NEXT: vmovaps 1632(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1600(%rdi), %ymm11 +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] +; AVX2-NEXT: vmovaps 1760(%rdi), %ymm10 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 1728(%rdi), %ymm13 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovaps 1408(%rdi), %ymm9 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1568(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1536(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm9 -; AVX2-NEXT: vmovaps 1632(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1600(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} 
xmm10 = xmm9[0,1],xmm10[2,3] -; AVX2-NEXT: vmovaps 1760(%rdi), %ymm9 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1728(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 1664(%rdi), %ymm12 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[1],ymm10[1],ymm13[4],ymm10[4],ymm13[5],ymm10[5] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovaps 1824(%rdi), %ymm10 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1824(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1792(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm11 -; AVX2-NEXT: vmovaps 1888(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1856(%rdi), %ymm7 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] -; AVX2-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 1792(%rdi), %ymm11 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1984(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1952(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 1920(%rdi), %ymm9 -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm12 +; AVX2-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1856(%rdi), %ymm11 +; AVX2-NEXT: vmovups %ymm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],xmm13[2,3] +; AVX2-NEXT: vmovaps 2016(%rdi), %ymm12 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1984(%rdi), %ymm10 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1952(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] ; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm14 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm14 +; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm13 -; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm15[0,1,2,3,4],mem[5],ymm15[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm13 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} 
ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm8 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss 1172(%rdi), %ymm5 +; AVX2-NEXT: vblendps 
{{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vbroadcastss 1428(%rdi), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1172(%rdi), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -12558,10 +12589,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1428(%rdi), %ymm1 +; AVX2-NEXT: vbroadcastss 1684(%rdi), %ymm1 ; AVX2-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -12570,26 +12602,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vbroadcastss 1940(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1684(%rdi), %ymm0 -; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] -; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = 
ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -12604,67 +12621,69 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm0 -; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} 
ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload @@ -12672,7 +12691,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 1272(%rdi), %ymm0 @@ -12681,7 +12700,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload ; AVX2-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -12695,10 +12714,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -12706,17 +12724,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm0 -; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: vunpckhps (%rsp), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -12743,7 +12760,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -12768,11 +12785,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm0 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -12792,16 +12809,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -12810,10 +12827,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-NEXT: vbroadcastss 1756(%rdi), %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastss 1756(%rdi), %ymm0 -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] @@ -12927,7 +12944,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 192(%rax) @@ -12947,10 +12964,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-NEXT: vmovaps %ymm14, 96(%rax) +; AVX2-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-NEXT: vmovaps %ymm15, 64(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; 
AVX2-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) @@ -12969,7 +12986,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm9 ; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -12997,7 +13014,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm12 ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm1 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -13202,7 +13219,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps %xmm13, %xmm9 @@ -13223,7 +13240,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -13368,7 +13385,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-FP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -13388,7 +13405,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte 
Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -13516,7 +13533,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -13531,7 +13548,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] @@ -13625,11 +13642,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13676,8 +13693,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm0 @@ -13698,11 +13715,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -13723,15 +13740,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm0 @@ -13748,160 +13765,173 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 
= ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps 
{{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[4],ymm0[4],ymm10[5],ymm0[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] +; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm9 -; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm13[0],ymm10[0],ymm13[1],ymm10[1],ymm13[4],ymm10[4],ymm13[5],ymm10[5] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm11 -; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm10 +; AVX2-FP-NEXT: 
vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] ; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm14 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm14 +; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm13 -; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm15[0,1,2,3,4],mem[5],ymm15[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm13 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm8 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss 1172(%rdi), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vbroadcastss 1428(%rdi), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1172(%rdi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -13910,10 +13940,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1428(%rdi), %ymm1 +; AVX2-FP-NEXT: vbroadcastss 1684(%rdi), %ymm1 ; AVX2-FP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -13922,26 +13953,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vbroadcastss 1940(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1684(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; 
AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -13956,67 +13972,69 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = 
ymm7[2,2,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload @@ -14024,7 +14042,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 1272(%rdi), %ymm0 @@ -14033,7 +14051,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -14047,10 +14065,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FP-NEXT: # 
ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -14058,17 +14075,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm0 -; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -14095,7 +14111,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -14120,11 +14136,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 
732(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -14144,16 +14160,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -14162,10 +14178,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FP-NEXT: vbroadcastss 1756(%rdi), %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 1756(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] @@ -14279,7 +14295,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax) @@ -14299,10 +14315,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-FP-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FP-NEXT: vmovaps %ymm14, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) @@ -14321,7 +14337,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm9 ; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -14349,7 +14365,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm12 ; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm1 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -14554,7 +14570,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps %xmm13, %xmm9 @@ -14575,7 +14591,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -14720,7 +14736,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-FCP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; 
AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -14740,7 +14756,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -14868,7 +14884,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -14883,7 +14899,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] @@ -14977,11 +14993,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15028,8 +15044,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm0 @@ -15050,11 +15066,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -15075,15 +15091,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm0 @@ -15100,160 +15116,173 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[4],ymm2[4],ymm10[5],ymm2[5] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[4],ymm0[4],ymm10[5],ymm0[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm9 -; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = 
ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[1],ymm10[1],ymm13[4],ymm10[4],ymm13[5],ymm10[5] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm11 -; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; 
AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] ; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm14 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm14 +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm13 -; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; 
AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm15[0,1,2,3,4],mem[5],ymm15[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm13 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] +; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm8 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, 
%xmm6 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 1172(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vbroadcastss 1428(%rdi), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1172(%rdi), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -15262,10 +15291,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1428(%rdi), %ymm1 +; AVX2-FCP-NEXT: vbroadcastss 1684(%rdi), %ymm1 ; AVX2-FCP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -15274,26 +15304,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm2 = 
ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vbroadcastss 1940(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1684(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -15308,67 +15323,69 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded 
Reload +; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = 
ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload @@ -15376,7 +15393,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 1272(%rdi), %ymm0 @@ -15385,7 +15402,7 @@ define void 
@load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -15399,10 +15416,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -15410,17 +15426,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm0 -; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -15447,7 +15462,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -15472,11 +15487,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -15496,16 +15511,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -15514,10 +15529,10 @@ define void 
@load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vbroadcastss 1756(%rdi), %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 1756(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] @@ -15631,7 +15646,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax) @@ -15651,10 +15666,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm14, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm15, 64(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -15666,478 +15681,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512: # %bb.0: ; AVX512-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512-NEXT: 
vmovdqa64 1920(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 
%zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 
(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, 
%zmm12 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; 
AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm16, 
%zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 
%zmm27, %zmm28 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 
(%rsp), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2d 
%zmm6, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 -; 
AVX512-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; 
AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -16148,18 +16149,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -16177,21 +16181,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; 
AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -16259,478 +16264,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-FCP-NEXT: 
vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 
%zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, 
%zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm1, 
%zmm29 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -16741,18 +16732,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -16770,21 +16764,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} 
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -16852,478 +16847,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), 
%zmm29 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-NEXT: 
vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; 
AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 -; 
AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, 
%zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermi2d %zmm13, 
%zmm5, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; 
AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -17334,18 +17315,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -17363,21 +17347,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; 
AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -17445,478 +17430,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, 
%zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 
%zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-FCP-NEXT: # zmm0 
= mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -17927,18 +17898,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -17956,21 +17930,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; 
AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -18038,478 +18013,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 
1408(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm21, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 
%zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; 
AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: 
vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; 
AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 
64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -18520,18 +18481,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -18549,21 +18513,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload 
-; AVX512BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -18631,478 +18596,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), 
%zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-FCP-NEXT: 
vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-FCP-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, 
%zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm24, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 ; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -19113,18 +19064,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -19142,21 +19096,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -19224,478 +19179,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 
-; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; 
AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d 
%zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; 
AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 
%zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -19706,18 +19647,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 
%zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -19735,21 +19679,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -19817,478 +19762,464 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, 
%zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d 
%zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm27, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 
64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm23 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm20, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm28, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, 
%zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -20299,18 +20230,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1 @@ -20328,21 +20262,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] 
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm21, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index aa7d8ceb14950..dfeccb33a20d5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -244,10 +244,10 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i64_stride2_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6] ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7] ; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rsi) ; AVX512-FCP-NEXT: vmovaps %ymm1, (%rdx) @@ -269,10 +269,10 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovaps %ymm1, (%rdx) @@ -294,10 +294,10 @@ define void 
@load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7] ; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rsi) ; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rdx) @@ -319,10 +319,10 @@ define void @load_i64_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride2_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rdx) @@ -378,14 +378,14 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX-NEXT: vmovaps %ymm5, (%rsi) -; AVX-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX-NEXT: vzeroupper @@ -458,9 +458,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -471,9 +471,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, 
%zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -484,9 +484,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -497,9 +497,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -510,9 +510,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -523,9 +523,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -536,9 +536,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -549,9 +549,9 @@ define void @load_i64_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-BW-FCP: # %bb.0: ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) @@ -639,24 +639,24 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] ; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] ; AVX-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX-NEXT: vmovaps %ymm9, (%rsi) +; AVX-NEXT: vmovaps %ymm10, (%rsi) ; AVX-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX-NEXT: vmovaps %ymm6, 96(%rsi) ; AVX-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm3, 96(%rdx) @@ -781,11 +781,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -801,11 +801,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; 
AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -821,11 +821,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -841,11 +841,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -861,11 +861,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -881,11 +881,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -901,11 +901,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -921,11 +921,11 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi) @@ -1100,64 +1100,64 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; ; AVX-LABEL: load_i64_stride2_vf32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 -; AVX-NEXT: vunpckhpd {{.*#+}} 
ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm3[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm12 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX-NEXT: vmovaps %ymm0, (%rsi) -; AVX-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm6, %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],mem[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm8, %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX-NEXT: vmovaps %ymm11, 128(%rsi) +; AVX-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX-NEXT: 
vmovaps %ymm15, (%rsi) +; AVX-NEXT: vmovaps %ymm1, 160(%rsi) ; AVX-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX-NEXT: vmovaps %ymm1, (%rdx) -; AVX-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX-NEXT: vmovaps %ymm13, 224(%rsi) +; AVX-NEXT: vmovaps %ymm0, (%rdx) +; AVX-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX-NEXT: vmovaps %ymm12, 160(%rdx) +; AVX-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1384,7 +1384,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1392,7 +1392,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1418,7 +1418,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1426,7 +1426,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1452,7 +1452,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1460,7 +1460,7 @@ define 
void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1486,7 +1486,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1494,7 +1494,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1520,7 +1520,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1528,7 +1528,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1554,7 +1554,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1562,7 +1562,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, 
%zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1588,7 +1588,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1596,7 +1596,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1622,7 +1622,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 @@ -1630,7 +1630,7 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 @@ -1988,117 +1988,117 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX-LABEL: load_i64_stride2_vf64: ; AVX: # %bb.0: ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX-NEXT: vmovaps (%rdi), %ymm11 +; AVX-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX-NEXT: vmovaps (%rdi), %ymm12 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm12 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm4, %ymm9 -; AVX-NEXT: 
vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm13 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm11, %ymm4 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm3, %ymm8 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm1, %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm12, %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm4, %ymm4 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm8, %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],mem[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm7, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm9, %ymm1 +; AVX-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 576(%rdi), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],mem[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 704(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX-NEXT: vinsertf128 $1, 736(%rdi), %ymm3, %ymm2 +; AVX-NEXT: vmovaps 
832(%rdi), %ymm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm1 +; AVX-NEXT: vmovaps 640(%rdi), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm4, %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 864(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vmovaps 768(%rdi), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmovaps 960(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 992(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vmovaps 896(%rdi), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX-NEXT: vmovaps %ymm3, 384(%rsi) -; AVX-NEXT: vmovaps %ymm6, 320(%rsi) +; AVX-NEXT: vinsertf128 $1, 800(%rdi), %ymm1, %ymm1 +; AVX-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] +; AVX-NEXT: vinsertf128 $1, 928(%rdi), %ymm2, %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX-NEXT: vmovaps %ymm4, 384(%rsi) +; AVX-NEXT: vmovaps %ymm5, 320(%rsi) ; AVX-NEXT: vmovaps %ymm10, 256(%rsi) -; AVX-NEXT: vmovaps %ymm13, 192(%rsi) +; AVX-NEXT: vmovaps %ymm14, 192(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX-NEXT: vmovaps %ymm1, (%rsi) -; AVX-NEXT: vmovaps %ymm4, 480(%rsi) +; AVX-NEXT: vmovaps %ymm3, 480(%rsi) ; AVX-NEXT: vmovaps %ymm7, 416(%rsi) -; AVX-NEXT: vmovaps %ymm11, 352(%rsi) +; AVX-NEXT: vmovaps %ymm9, 352(%rsi) ; AVX-NEXT: vmovaps %ymm15, 288(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 224(%rsi) @@ -2109,11 +2109,11 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX-NEXT: vmovaps %ymm5, 480(%rdx) +; AVX-NEXT: vmovaps %ymm6, 480(%rdx) ; AVX-NEXT: vmovaps %ymm8, 384(%rdx) -; AVX-NEXT: vmovaps %ymm9, 416(%rdx) +; AVX-NEXT: vmovaps %ymm11, 416(%rdx) ; AVX-NEXT: vmovaps %ymm12, 320(%rdx) -; AVX-NEXT: vmovaps %ymm14, 352(%rdx) +; AVX-NEXT: vmovaps %ymm13, 352(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2663,7 +2663,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2679,7 +2679,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2725,7 +2725,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2741,7 +2741,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2787,7 +2787,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 ; 
AVX512DQ-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2803,7 +2803,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2849,7 +2849,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2865,7 +2865,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2911,7 +2911,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2927,7 +2927,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -2973,7 +2973,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -2989,7 +2989,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = 
[1,3,5,7,9,11,13,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -3035,7 +3035,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -3051,7 +3051,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 @@ -3097,7 +3097,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 @@ -3113,7 +3113,7 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index c8b95cd71c5d1..27ede99cc9fd1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -36,8 +36,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovdqa %xmm3, (%rsi) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) @@ -49,8 +49,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa %xmm3, (%rsi) ; AVX2-NEXT: vmovdqa %xmm0, (%rdx) ; AVX2-NEXT: vmovdqa %xmm1, (%rcx) @@ -62,8 +62,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx) @@ -75,8 +75,8 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx) @@ -96,7 +96,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i64_stride3_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] ; AVX512-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3] ; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2 @@ -121,7 +121,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] ; AVX512DQ-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3] ; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2 @@ -146,7 +146,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i64_stride3_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] ; AVX512BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3] ; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2 @@ -171,7 +171,7 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] ; AVX512DQ-BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2 @@ -223,8 +223,8 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3] ; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = 
ymm2[0],ymm1[1],ymm2[2],ymm1[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3] ; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm4 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] @@ -304,11 +304,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-NEXT: vmovdqa %ymm3, (%rdx) @@ -320,11 +320,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -336,11 +336,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) @@ -352,11 +352,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -368,11 +368,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # 
%bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -384,11 +384,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -400,11 +400,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) @@ -416,11 +416,11 @@ define void @load_i64_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,3,6,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,6,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,4,7,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,7,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,5,8,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,8,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) @@ -484,32 +484,32 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i64_stride3_vf8: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm2, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; AVX-NEXT: vmovaps 112(%rdi), %xmm5 -; 
AVX-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2],ymm5[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[3] +; AVX-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3] +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX-NEXT: vmovapd 128(%rdi), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm3[2,3] +; AVX-NEXT: vmovaps 112(%rdi), %xmm6 +; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3] ; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm3[0],ymm5[3],ymm3[3] ; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] ; AVX-NEXT: vmovapd %ymm7, 32(%rsi) ; AVX-NEXT: vmovapd %ymm4, (%rsi) -; AVX-NEXT: vmovapd %ymm6, 32(%rdx) -; AVX-NEXT: vmovapd %ymm3, (%rdx) -; AVX-NEXT: vmovapd %ymm0, 32(%rcx) -; AVX-NEXT: vmovapd %ymm1, (%rcx) +; AVX-NEXT: vmovapd %ymm5, 32(%rdx) +; AVX-NEXT: vmovapd %ymm2, (%rdx) +; AVX-NEXT: vmovapd %ymm1, 32(%rcx) +; AVX-NEXT: vmovapd %ymm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -522,27 +522,27 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} 
ymm3 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] +; AVX2-NEXT: vmovaps 112(%rdi), %xmm6 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-NEXT: vmovaps %ymm0, 32(%rdx) @@ -561,27 +561,27 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 80(%rdi), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx) @@ -600,27 +600,27 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} 
ymm5 = ymm3[0,3,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 80(%rdi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx) @@ -635,17 +635,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: 
vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -658,17 +658,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -681,17 +681,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -704,17 +704,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[1,4,7,10,13,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -727,17 +727,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -750,17 +750,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -773,17 +773,17 @@ define 
void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -796,17 +796,17 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -930,58 +930,58 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i64_stride3_vf16: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX-NEXT: vmovapd 320(%rdi), %ymm2 -; AVX-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = 
ymm7[1],ymm5[0],ymm7[3],ymm5[3] -; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[3] -; AVX-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[3] -; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm11 +; AVX-NEXT: vmovapd 128(%rdi), %ymm3 +; AVX-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm3[2,3] +; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm5 +; AVX-NEXT: vmovapd 320(%rdi), %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm3[0],ymm7[3],ymm3[3] +; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm8[1],ymm4[0],ymm8[3],ymm4[3] +; AVX-NEXT: vbroadcastsd 368(%rdi), %ymm11 ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[3] -; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm12 +; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm9[1],ymm2[0],ymm9[3],ymm2[3] +; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm12 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] -; AVX-NEXT: vmovaps 112(%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3] -; AVX-NEXT: vmovaps 304(%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm6[1],ymm0[0],ymm6[3],ymm0[3] +; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] +; AVX-NEXT: vmovaps 304(%rdi), %xmm13 +; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm13, %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] +; AVX-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm5, %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2],ymm4[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm5[1],ymm9[2],ymm5[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] -; AVX-NEXT: vmovaps 16(%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] -; AVX-NEXT: vmovaps 208(%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3] +; AVX-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm5, %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm5[1],ymm6[2],ymm5[3] ; 
AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] ; AVX-NEXT: vmovapd %ymm9, (%rsi) -; AVX-NEXT: vmovapd %ymm3, 64(%rsi) +; AVX-NEXT: vmovapd %ymm6, 64(%rsi) ; AVX-NEXT: vmovapd %ymm8, 96(%rsi) -; AVX-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX-NEXT: vmovapd %ymm11, 64(%rdx) -; AVX-NEXT: vmovapd %ymm10, (%rdx) -; AVX-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX-NEXT: vmovapd %ymm4, 32(%rdx) +; AVX-NEXT: vmovapd %ymm1, 32(%rsi) +; AVX-NEXT: vmovapd %ymm12, 64(%rdx) +; AVX-NEXT: vmovapd %ymm11, (%rdx) +; AVX-NEXT: vmovapd %ymm10, 96(%rdx) +; AVX-NEXT: vmovapd %ymm7, 32(%rdx) ; AVX-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX-NEXT: vmovapd %ymm1, (%rcx) -; AVX-NEXT: vmovapd %ymm2, 96(%rcx) -; AVX-NEXT: vmovapd %ymm5, 32(%rcx) +; AVX-NEXT: vmovapd %ymm2, (%rcx) +; AVX-NEXT: vmovapd %ymm4, 96(%rcx) +; AVX-NEXT: vmovapd %ymm3, 32(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1006,18 +1006,18 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm10 +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] @@ -1026,23 +1026,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovaps 304(%rdi), %xmm9 +; AVX2-NEXT: vmovaps 112(%rdi), %xmm9 ; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovaps 304(%rdi), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX2-NEXT: vblendps 
{{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) @@ -1051,10 +1051,10 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-NEXT: vmovaps %ymm9, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1079,18 +1079,18 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 368(%rdi), %ymm10 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] @@ -1099,23 +1099,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) @@ -1124,10 +1124,10 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-FP-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rcx) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -1152,18 +1152,18 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 368(%rdi), %ymm10 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} 
ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] @@ -1172,23 +1172,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) @@ -1197,10 +1197,10 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rcx) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1212,23 +1212,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm8 = [1,4,7,10,13,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1249,23 +1249,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1286,23 +1286,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1323,23 +1323,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1360,23 +1360,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, 
%zmm10, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1397,23 +1397,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1434,23 +1434,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1471,23 +1471,23 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 @@ -1744,164 +1744,166 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i64_stride3_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $232, %rsp -; AVX-NEXT: vmovapd 32(%rdi), %ymm7 -; AVX-NEXT: vmovapd 704(%rdi), %ymm13 -; AVX-NEXT: vmovapd 512(%rdi), %ymm9 -; AVX-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm10[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3] +; AVX-NEXT: subq $264, %rsp # imm = 0x108 +; AVX-NEXT: vmovapd 416(%rdi), %ymm12 +; AVX-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX-NEXT: vmovapd 32(%rdi), %ymm4 +; AVX-NEXT: vmovapd 704(%rdi), %ymm10 +; AVX-NEXT: vmovapd 512(%rdi), %ymm8 +; AVX-NEXT: vmovapd 320(%rdi), %ymm5 +; AVX-NEXT: vmovapd 128(%rdi), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm13 = mem[0,1],ymm9[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm8[2,3] +; AVX-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm10[2,3] ; AVX-NEXT: 
vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm7[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm12[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[1],ymm9[0],ymm13[3],ymm9[3] ; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm5[0],ymm11[3],ymm5[3] ; AVX-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm8[0],ymm14[3],ymm8[3] ; AVX-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm10[0],ymm2[3],ymm10[3] ; AVX-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm4[0],ymm3[3],ymm4[3] ; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[3] -; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm7[0],ymm6[3],ymm7[3] +; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[3] -; AVX-NEXT: vbroadcastsd 464(%rdi), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] -; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm12[0],ymm15[3],ymm12[3] +; AVX-NEXT: vbroadcastsd 464(%rdi), %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 608(%rdi), %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm1[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm1[0],ymm2[3],ymm1[3] ; AVX-NEXT: vbroadcastsd 656(%rdi), %ymm14 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; 
AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; AVX-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm3, %ymm3 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = mem[0],ymm3[1],mem[2],ymm3[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3] -; AVX-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm0[1],ymm13[2],ymm0[3] +; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 304(%rdi), %xmm14 +; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm14 +; AVX-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm13, %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2],ymm3[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0],ymm3[1],ymm14[2],ymm3[3] +; AVX-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm3, %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3] +; AVX-NEXT: vmovaps 496(%rdi), %xmm6 +; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm6, %ymm6 +; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0],ymm6[1],mem[2],ymm6[3] +; AVX-NEXT: vmovaps 688(%rdi), %xmm13 +; AVX-NEXT: vinsertf128 $1, 736(%rdi), %ymm13, %ymm13 +; AVX-NEXT: vmovaps 400(%rdi), %xmm14 +; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm14, %ymm14 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] -; AVX-NEXT: vmovaps 688(%rdi), %xmm8 -; AVX-NEXT: vinsertf128 $1, 736(%rdi), %ymm8, %ymm8 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 
32-byte Folded Reload -; AVX-NEXT: # ymm9 = mem[0],ymm8[1],mem[2],ymm8[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3] +; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX-NEXT: # ymm14 = mem[0],ymm13[1],mem[2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3] ; AVX-NEXT: vmovaps 592(%rdi), %xmm13 ; AVX-NEXT: vinsertf128 $1, 640(%rdi), %ymm13, %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3] -; AVX-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX-NEXT: vmovapd %ymm6, 128(%rsi) -; AVX-NEXT: vmovapd %ymm4, 64(%rsi) -; AVX-NEXT: vmovapd %ymm15, (%rsi) -; AVX-NEXT: vmovapd %ymm9, 224(%rsi) +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2],ymm1[3] +; AVX-NEXT: vmovapd %ymm2, 192(%rsi) +; AVX-NEXT: vmovapd %ymm8, 128(%rsi) +; AVX-NEXT: vmovapd %ymm5, 64(%rsi) +; AVX-NEXT: vmovapd %ymm0, (%rsi) +; AVX-NEXT: vmovapd %ymm14, 224(%rsi) ; AVX-NEXT: vmovapd %ymm7, 160(%rsi) ; AVX-NEXT: vmovapd %ymm11, 96(%rsi) -; AVX-NEXT: vmovapd %ymm12, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX-NEXT: vmovapd %ymm2, 192(%rcx) -; AVX-NEXT: vmovapd %ymm8, 224(%rcx) -; AVX-NEXT: vmovapd %ymm1, 128(%rcx) -; AVX-NEXT: vmovapd %ymm3, 160(%rcx) -; AVX-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX-NEXT: vmovapd %ymm10, 96(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rcx) -; AVX-NEXT: vmovapd %ymm14, 32(%rcx) -; AVX-NEXT: addq $232, %rsp +; AVX-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 
+; AVX-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX-NEXT: vmovapd %ymm1, 192(%rcx) +; AVX-NEXT: vmovapd %ymm10, 224(%rcx) +; AVX-NEXT: vmovapd %ymm12, 128(%rcx) +; AVX-NEXT: vmovapd %ymm6, 160(%rcx) +; AVX-NEXT: vmovapd %ymm3, 64(%rcx) +; AVX-NEXT: vmovapd %ymm4, 96(%rcx) +; AVX-NEXT: vmovapd %ymm9, (%rcx) +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX-NEXT: addq $264, %rsp # imm = 0x108 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i64_stride3_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $232, %rsp -; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 704(%rdi), %ymm11 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm14 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1911,21 +1913,21 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1936,34 +1938,34 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vbroadcastsd 560(%rdi), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vbroadcastsd 752(%rdi), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm7 +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps 
{{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vbroadcastsd 560(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vbroadcastsd 752(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 464(%rdi), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] @@ -1973,63 +1975,63 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 496(%rdi), %xmm4 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovaps 400(%rdi), %xmm5 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 
+; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps 688(%rdi), %xmm7 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovaps 592(%rdi), %xmm10 -; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm9, 32(%rsi) ; AVX2-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-NEXT: vmovaps %ymm15, 128(%rdx) ; AVX2-NEXT: vmovaps %ymm13, 64(%rdx) ; AVX2-NEXT: vmovaps %ymm12, (%rdx) ; AVX2-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-NEXT: vmovaps %ymm8, 96(%rdx) ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 64(%rcx) @@ -2043,30 +2045,30 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i64_stride3_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $232, %rsp -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2076,21 +2078,21 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2101,34 +2103,34 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vbroadcastsd 176(%rdi), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vbroadcastsd 368(%rdi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vbroadcastsd 560(%rdi), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vbroadcastsd 752(%rdi), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vbroadcastsd 80(%rdi), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vbroadcastsd 176(%rdi), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 368(%rdi), %ymm7 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vbroadcastsd 560(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vbroadcastsd 752(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = 
ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vbroadcastsd 80(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 272(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 464(%rdi), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] @@ -2138,63 +2140,63 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 496(%rdi), %xmm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; 
AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rsi) ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm15, 128(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm13, 64(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm12, (%rdx) ; AVX2-FP-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-FP-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-FP-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rdx) ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FP-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx) @@ -2208,30 +2210,30 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i64_stride3_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $232, %rsp -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2241,21 +2243,21 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), 
%ymm4 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2266,34 +2268,34 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vbroadcastsd 176(%rdi), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vbroadcastsd 368(%rdi), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vbroadcastsd 560(%rdi), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vbroadcastsd 752(%rdi), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vbroadcastsd 80(%rdi), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vbroadcastsd 176(%rdi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 368(%rdi), %ymm7 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vbroadcastsd 560(%rdi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vbroadcastsd 752(%rdi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vbroadcastsd 80(%rdi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 272(%rdi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 464(%rdi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] @@ -2303,63 +2305,63 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 496(%rdi), %xmm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm15, 128(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm13, 64(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm12, (%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rdx) ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rcx) @@ -2384,10 +2386,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[0,3,6,9,12,15,u,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2397,10 +2399,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2410,9 +2412,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2449,10 +2451,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2462,10 +2464,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2475,9 +2477,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm18 = [10,13,0,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2514,10 +2516,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2527,10 +2529,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2540,9 +2542,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2579,10 +2581,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2592,10 +2594,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, 
%zmm9, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2605,9 +2607,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2644,10 +2646,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2657,10 +2659,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2670,9 +2672,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2709,10 +2711,10 @@ define void @load_i64_stride3_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2722,10 +2724,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2735,9 +2737,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2774,10 +2776,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2787,10 +2789,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2800,9 +2802,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -2839,10 +2841,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 @@ -2852,10 +2854,10 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 @@ -2865,9 +2867,9 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 @@ -3390,250 +3392,287 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i64_stride3_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $1096, %rsp # imm = 0x448 -; 
AVX-NEXT: vmovapd 896(%rdi), %ymm0 -; AVX-NEXT: vmovapd 704(%rdi), %ymm1 -; AVX-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX-NEXT: vmovapd 320(%rdi), %ymm5 -; AVX-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm5[2,3] +; AVX-NEXT: subq $1384, %rsp # imm = 0x568 +; AVX-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 992(%rdi), %ymm4 +; AVX-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 608(%rdi), %ymm13 +; AVX-NEXT: vmovapd 416(%rdi), %ymm15 +; AVX-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1088(%rdi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 896(%rdi), %ymm14 +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 512(%rdi), %ymm10 +; AVX-NEXT: vmovapd 320(%rdi), %ymm11 +; AVX-NEXT: vmovapd 128(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm8[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm11[2,3] +; AVX-NEXT: vmovupd %ymm12, (%rsp) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm10[2,3] ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd %ymm2, %ymm4 -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd %ymm1, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm2 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd %ymm7, %ymm0 +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm7[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd 
%ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd %ymm0, %ymm2 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[3] -; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[3] -; AVX-NEXT: vbroadcastsd 368(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[3] -; AVX-NEXT: vbroadcastsd 560(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[3] -; AVX-NEXT: vbroadcastsd 752(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[3] -; AVX-NEXT: vbroadcastsd 944(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1088(%rdi), %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] -; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[3] -; AVX-NEXT: vbroadcastsd 1136(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1280(%rdi), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm8[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[3] -; AVX-NEXT: vbroadcastsd 1328(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1472(%rdi), %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[3] -; AVX-NEXT: vbroadcastsd 1520(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 32(%rdi), %ymm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm15[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3] -; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 224(%rdi), %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[3] -; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 416(%rdi), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} 
ymm0 = mem[0,1],ymm12[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[3] -; AVX-NEXT: vbroadcastsd 464(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd %ymm13, %ymm7 +; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm13[2,3] +; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm6[2,3] +; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd %ymm4, %ymm3 +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm13[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm2[1],ymm8[0],ymm2[3],ymm8[3] +; AVX-NEXT: vbroadcastsd 176(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd (%rsp), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm11[0],ymm4[3],ymm11[3] +; AVX-NEXT: vbroadcastsd 368(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd %ymm10, %ymm14 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm10[1],ymm14[0],ymm10[3],ymm14[3] +; AVX-NEXT: vbroadcastsd 560(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 752(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 944(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 1136(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 1328(%rdi), %ymm8 +; 
AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 1520(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[1],mem[0],ymm4[3],mem[3] +; AVX-NEXT: vbroadcastsd 80(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm0[0],ymm4[3],ymm0[3] +; AVX-NEXT: vbroadcastsd 272(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm15[0],ymm0[3],ymm15[3] +; AVX-NEXT: vbroadcastsd 464(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[3] -; AVX-NEXT: vbroadcastsd 656(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm12[1],ymm7[0],ymm12[3],ymm7[3] +; AVX-NEXT: vbroadcastsd 656(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[3] -; AVX-NEXT: vbroadcastsd 848(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm9[1],ymm6[0],ymm9[3],ymm6[3] +; AVX-NEXT: vbroadcastsd 848(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 992(%rdi), %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[3] -; AVX-NEXT: vbroadcastsd 1040(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm3[0],ymm5[3],ymm3[3] +; AVX-NEXT: vbroadcastsd 1040(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1184(%rdi), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[3] -; AVX-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm13[0],ymm1[3],ymm13[3] +; AVX-NEXT: vbroadcastsd 1232(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: 
vmovapd 1376(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[3] -; AVX-NEXT: vbroadcastsd 1424(%rdi), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX-NEXT: vmovapd 1376(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm8[0],ymm12[3],ymm8[3] +; AVX-NEXT: vbroadcastsd 1424(%rdi), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 496(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 496(%rdi), %xmm2 +; AVX-NEXT: vinsertf128 $1, 544(%rdi), %ymm2, %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0],ymm2[1],ymm10[2],ymm2[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 688(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1072(%rdi), %xmm0 ; AVX-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1264(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] -; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1264(%rdi), %xmm1 +; AVX-NEXT: vinsertf128 $1, 1312(%rdi), %ymm1, %ymm2 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: # ymm8 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1360(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm1[1],ymm6[2],ymm1[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm3 +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm7 +; AVX-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm5 +; AVX-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX-NEXT: vinsertf128 $1, 640(%rdi), %ymm9, %ymm4 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm2, %ymm1 +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: # ymm15 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX-NEXT: vmovaps 976(%rdi), %xmm6 +; AVX-NEXT: vinsertf128 $1, 1024(%rdi), %ymm6, %ymm6 +; AVX-NEXT: vmovaps 1168(%rdi), %xmm9 +; AVX-NEXT: vinsertf128 $1, 1216(%rdi), %ymm9, %ymm14 +; AVX-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX-NEXT: vinsertf128 $1, 1408(%rdi), %ymm9, %ymm0 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX-NEXT: # ymm10 = mem[0],ymm14[1],mem[2],ymm14[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],mem[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX-NEXT: vinsertf128 $1, 1024(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm1[1],ymm11[2],ymm1[3] -; AVX-NEXT: vblendpd 
{{.*#+}} ymm4 = ymm4[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] -; AVX-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] -; AVX-NEXT: vmovaps 592(%rdi), %xmm4 -; AVX-NEXT: vinsertf128 $1, 640(%rdi), %ymm4, %ymm4 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm9[1],ymm4[2],ymm9[3] -; AVX-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm4 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX-NEXT: # ymm9 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] -; AVX-NEXT: vmovaps 208(%rdi), %xmm11 -; AVX-NEXT: vinsertf128 $1, 256(%rdi), %ymm11, %ymm11 -; AVX-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: # ymm12 = mem[0],ymm11[1],mem[2],ymm11[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm11[0],ymm13[1],ymm11[2],ymm13[3] -; AVX-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm11, %ymm11 -; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] -; AVX-NEXT: vmovapd %ymm7, 448(%rsi) -; AVX-NEXT: vmovapd %ymm3, 384(%rsi) -; AVX-NEXT: vmovapd %ymm2, 320(%rsi) -; AVX-NEXT: vmovapd %ymm14, 256(%rsi) -; AVX-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX-NEXT: vmovapd %ymm9, 128(%rsi) -; AVX-NEXT: vmovapd %ymm12, 64(%rsi) +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX-NEXT: # ymm9 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX-NEXT: # ymm6 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] +; AVX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX-NEXT: vmovapd %ymm11, 448(%rsi) +; AVX-NEXT: vmovapd %ymm10, 384(%rsi) +; AVX-NEXT: vmovaps %ymm9, 320(%rsi) +; AVX-NEXT: vmovaps %ymm8, 256(%rsi) +; AVX-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX-NEXT: vmovaps %ymm0, (%rsi) +; AVX-NEXT: vmovaps %ymm15, 480(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 480(%rsi) -; AVX-NEXT: vmovapd %ymm15, 416(%rsi) +; AVX-NEXT: vmovaps %ymm0, 416(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3678,19 +3717,19 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX-NEXT: vmovaps %ymm11, (%rcx) -; AVX-NEXT: vmovapd %ymm13, 64(%rcx) -; AVX-NEXT: vmovapd %ymm4, 128(%rcx) -; AVX-NEXT: vmovapd %ymm10, 192(%rcx) -; AVX-NEXT: vmovapd %ymm1, 256(%rcx) -; AVX-NEXT: vmovapd %ymm6, 320(%rcx) +; AVX-NEXT: vmovaps %ymm1, (%rcx) +; AVX-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX-NEXT: vmovaps %ymm13, 256(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX-NEXT: vmovapd %ymm12, 448(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 480(%rcx) -; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 352(%rcx) @@ -3700,70 +3739,70 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i64_stride3_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-NEXT: vmovaps 896(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 704(%rdi), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 672(%rdi), %ymm6 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm9 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm10 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3793,16 +3832,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3821,16 +3860,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 992(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; 
AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 1184(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3851,8 +3890,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3879,8 +3918,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3904,12 +3943,12 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 464(%rdi), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] @@ -3924,159 +3963,159 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vbroadcastsd 848(%rdi), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vmovups 
%ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 1040(%rdi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vbroadcastsd 1424(%rdi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 1072(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX2-NEXT: vblendps 
{{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 1456(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 976(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = 
xmm1[0,1],mem[2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 320(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 480(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 416(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 352(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 288(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-NEXT: vmovaps %ymm11, 448(%rdx) -; AVX2-NEXT: vmovaps %ymm13, 384(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 320(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 256(%rdx) -; AVX2-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 480(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 352(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 288(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 160(%rdx) -; AVX2-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 448(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 384(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 320(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 256(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 480(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 416(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 352(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 288(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-NEXT: vmovaps %ymm13, 448(%rdx) +; AVX2-NEXT: vmovaps %ymm15, 384(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 320(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 480(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 352(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 288(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX2-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX2-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX2-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-NEXT: vmovaps %ymm10, 352(%rcx) +; AVX2-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX2-NEXT: vmovaps %ymm6, 320(%rcx) +; AVX2-NEXT: vmovaps %ymm7, 384(%rcx) +; AVX2-NEXT: vmovaps %ymm8, 448(%rcx) +; AVX2-NEXT: vmovaps %ymm9, 480(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX2-NEXT: vmovaps %ymm12, 288(%rcx) ; AVX2-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4092,59 +4131,59 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i64_stride3_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4174,16 +4213,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4202,16 +4241,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-FP-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4232,8 +4271,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -4260,8 +4299,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovups %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -4285,12 +4324,12 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vbroadcastsd 80(%rdi), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 272(%rdi), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 464(%rdi), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] @@ -4305,159 +4344,159 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vbroadcastsd 848(%rdi), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 1040(%rdi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vbroadcastsd 1424(%rdi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FP-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 1072(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1456(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 976(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],mem[2,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 320(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 480(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps 
%ymm15, 416(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 352(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 288(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-FP-NEXT: vmovaps %ymm11, 448(%rdx) -; AVX2-FP-NEXT: vmovaps %ymm13, 384(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 320(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 256(%rdx) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 480(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 352(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 288(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 448(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 384(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 320(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 256(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 
(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 480(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 416(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 352(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 288(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FP-NEXT: vmovaps %ymm13, 448(%rdx) +; AVX2-FP-NEXT: vmovaps %ymm15, 384(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 320(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 480(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 352(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 288(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, 352(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm6, 320(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 384(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm8, 448(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, 480(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm12, 288(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm14, 224(%rcx) ; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4473,59 +4512,59 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i64_stride3_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4555,16 +4594,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm10 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4583,16 +4622,16 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-FCP-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4613,8 +4652,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -4641,8 +4680,8 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -4666,12 +4705,12 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcastsd 80(%rdi), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 272(%rdi), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vshufps 
{{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 464(%rdi), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] @@ -4686,159 +4725,159 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcastsd 848(%rdi), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 1040(%rdi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vbroadcastsd 1424(%rdi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 1072(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1456(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 976(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 320(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 480(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 416(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 352(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 288(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-FCP-NEXT: vmovaps %ymm11, 
448(%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm13, 384(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 320(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 256(%rdx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 480(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 352(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 288(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = mem[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 448(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 384(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 320(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 256(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 480(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 416(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 352(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 288(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rsi) 
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FCP-NEXT: vmovaps %ymm13, 448(%rdx) +; AVX2-FCP-NEXT: vmovaps %ymm15, 384(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 320(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 480(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 352(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 288(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, 352(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm6, 320(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 384(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm8, 448(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, 480(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm11, 352(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm12, 288(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4874,7 +4913,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -4889,10 +4928,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -4917,7 +4956,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -4928,7 +4967,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -4937,7 +4976,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -4997,7 +5036,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5012,10 +5051,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5040,7 +5079,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm25, 
%zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5051,7 +5090,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5060,7 +5099,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5120,7 +5159,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5135,10 +5174,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5163,7 +5202,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5174,7 +5213,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = 
[0,1,2,3,4,8,11,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5183,7 +5222,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5243,7 +5282,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5258,10 +5297,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5286,7 +5325,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5297,7 +5336,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5306,7 +5345,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5366,7 +5405,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5381,10 +5420,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5409,7 +5448,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5420,7 +5459,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5429,7 +5468,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5489,7 +5528,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5504,10 +5543,10 @@ define void 
@load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5532,7 +5571,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5543,7 +5582,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5552,7 +5591,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5612,7 +5651,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5627,10 +5666,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[10,13,0,3,6,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5655,7 +5694,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5666,7 +5705,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5675,7 +5714,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 @@ -5735,7 +5774,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,3,6,9,12,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 @@ -5750,10 +5789,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [1,4,7,10,13,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [10,13,0,3,6,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 @@ -5778,7 +5817,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 @@ -5789,7 +5828,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 @@ -5798,7 +5837,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 0c7c3f4b16646..b48abf5976bd7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -120,7 +120,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i64_stride4_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -154,7 +154,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i64_stride4_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -188,7 +188,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i64_stride4_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] @@ -222,7 +222,7 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride4_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = 
xmm1[1],mem[1] @@ -672,38 +672,38 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 -; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-NEXT: vmovaps 160(%rdi), %xmm11 -; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm11, %ymm11 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm12, %ymm12 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm7 +; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm9 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm8[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-NEXT: vmovaps %ymm13, 32(%rsi) -; AVX2-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-NEXT: vmovaps %ymm12, 32(%rcx) -; AVX2-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-NEXT: vmovaps %ymm12, (%rsi) +; AVX2-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-NEXT: vmovaps %ymm4, (%r8) ; AVX2-NEXT: vzeroupper @@ -717,38 +717,38 @@ define 
void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm11 -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm11, %ymm11 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm12, %ymm12 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm7 +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm8[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovaps %ymm13, 32(%rsi) -; AVX2-FP-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-FP-NEXT: vmovaps %ymm12, 32(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm12, (%rsi) +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FP-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FP-NEXT: 
vmovaps %ymm9, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) ; AVX2-FP-NEXT: vzeroupper @@ -762,38 +762,38 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm11 -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm11, %ymm11 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm12, %ymm12 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm7 +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm8[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovaps %ymm13, 32(%rsi) -; AVX2-FCP-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm12, 32(%rcx) -; 
AVX2-FCP-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm12, (%rsi) +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) ; AVX2-FCP-NEXT: vzeroupper @@ -810,23 +810,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -846,23 +846,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -882,23 +882,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -918,23 +918,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -954,23 +954,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -990,23 +990,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1026,23 +1026,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1062,23 +1062,23 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) @@ -1399,27 +1399,27 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm10 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 -; AVX2-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm6, %ymm11 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm12 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovaps (%rdi), %xmm8 @@ -1437,54 +1437,54 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm13[0],ymm7[2],ymm13[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vmovaps %ymm5, %ymm14 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] -; AVX2-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],ymm0[2,3] +; AVX2-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = 
ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, (%rsi) @@ -1502,13 +1502,13 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm15, (%rcx) -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 64(%r8) ; AVX2-NEXT: vmovaps %ymm4, (%r8) -; AVX2-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-NEXT: vmovaps %ymm12, 32(%r8) ; AVX2-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-NEXT: vzeroupper @@ -1517,27 +1517,27 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i64_stride4_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm10 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 -; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm6, %ymm11 +; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm12 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 @@ -1555,54 +1555,54 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm13[0],ymm7[2],ymm13[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FP-NEXT: vmovaps %ymm5, %ymm14 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 
= ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) @@ -1620,13 +1620,13 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm15, (%rcx) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8) ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FP-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-FP-NEXT: vmovaps %ymm12, 32(%r8) ; AVX2-FP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FP-NEXT: vzeroupper @@ -1635,27 +1635,27 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i64_stride4_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FCP-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm10 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 -; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm6, %ymm11 +; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm12 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 @@ -1673,54 +1673,54 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm13[0],ymm7[2],ymm13[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm14 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} 
ymm0 = ymm2[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) @@ -1738,13 +1738,13 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm15, (%rcx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm12, 32(%r8) ; AVX2-FCP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FCP-NEXT: vzeroupper @@ -1759,21 +1759,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -1784,7 +1784,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -1795,14 +1795,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -1821,21 +1821,21 @@ define void 
@load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -1846,7 +1846,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -1857,14 +1857,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -1883,21 +1883,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), 
%zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -1908,7 +1908,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -1919,14 +1919,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -1945,21 +1945,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: # zmm7 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -1970,7 +1970,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -1981,14 +1981,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -2007,21 +2007,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: 
vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -2032,7 +2032,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -2043,14 +2043,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -2069,21 +2069,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, 
%zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -2094,7 +2094,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -2105,14 +2105,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -2131,21 +2131,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -2156,7 +2156,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -2167,14 +2167,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -2193,21 +2193,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, 
%zmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm12 @@ -2218,7 +2218,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm14 @@ -2229,14 +2229,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx) @@ -2914,16 +2914,16 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -3171,16 +3171,16 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 
128(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -3428,16 +3428,16 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -3687,28 +3687,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 
+; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -3719,17 +3719,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -3740,17 +3740,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -3762,21 +3762,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, 
%zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -3788,7 +3788,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3801,28 +3801,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, 
%zmm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -3833,17 +3833,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -3854,17 +3854,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -3876,21 +3876,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = 
zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -3902,7 +3902,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -3915,28 +3915,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm19, 
%zmm18 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -3947,17 +3947,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -3968,17 +3968,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -3990,21 +3990,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm8, 
%zmm28, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4016,7 +4016,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4029,28 +4029,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -4061,17 +4061,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -4082,17 +4082,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -4104,21 +4104,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm12, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4130,7 +4130,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -4143,28 +4143,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -4175,17 +4175,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -4196,17 +4196,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -4218,21 +4218,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: 
vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4244,7 +4244,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4257,28 +4257,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -4289,17 +4289,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -4310,17 +4310,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -4332,21 +4332,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4358,7 +4358,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -4371,28 +4371,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q 
%zmm4, %zmm19, %zmm20 @@ -4403,17 +4403,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -4424,17 +4424,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -4446,21 +4446,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4472,7 +4472,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -4485,28 +4485,28 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = 
zmm18[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 @@ -4517,17 +4517,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 @@ -4538,17 +4538,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 @@ -4560,21 +4560,21 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm15 +; 
AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%rdx) @@ -4586,7 +4586,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -5950,16 +5950,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -6487,16 +6487,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-FP-NEXT: 
vmovaps 384(%rdi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-FP-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -7024,16 +7024,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm4 ; AVX2-FCP-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] @@ -7612,15 +7612,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -7908,15 +7908,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 
{{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -8204,15 +8204,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -8500,15 +8500,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -8796,15 +8796,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqu64 
%zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -9092,15 +9092,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -9388,15 +9388,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 @@ -9684,15 +9684,15 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 07988a416bac4..1e0faba213ce7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -367,17 +367,17 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm8[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -404,17 +404,17 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = 
xmm5[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -441,17 +441,17 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -467,22 +467,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqa %ymm2, (%rsi) @@ -497,22 +497,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -527,22 +527,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512DQ-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) @@ -557,22 +557,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -587,22 +587,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512BW-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512BW-NEXT: 
vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) @@ -617,22 +617,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -647,22 +647,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vpbroadcastq %xmm4, %ymm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u] ; 
AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpbroadcastq 144(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) @@ -677,22 +677,22 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) @@ -900,8 +900,8 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = xmm15[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -971,8 +971,8 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = xmm15[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -1042,8 +1042,8 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = xmm15[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -1088,49 +1088,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; 
AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1146,49 +1146,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1204,49 +1204,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1262,49 +1262,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), 
%zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1320,49 +1320,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 ; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1378,49 +1378,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; 
AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1436,49 +1436,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q 
%zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1494,49 +1494,49 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,11,0,1,6,11,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,12,0,2,7,12,0,2] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,0,11,0,5,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) @@ -1922,7 +1922,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm2 @@ -1933,7 +1933,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm8 @@ -1949,15 +1949,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] @@ -2012,39 +2012,39 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; 
AVX2-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-NEXT: # xmm4 = mem[0,1],xmm4[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,1],xmm0[2,3] @@ -2079,8 +2079,8 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-NEXT: vzeroupper @@ -2091,7 +2091,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm2 @@ -2102,7 +2102,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = 
xmm5[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm8 @@ -2118,15 +2118,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] @@ -2181,39 +2181,39 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = 
mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] @@ -2248,8 +2248,8 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9) ; 
AVX2-FP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FP-NEXT: vzeroupper @@ -2260,7 +2260,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm2 @@ -2271,7 +2271,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm8 @@ -2287,15 +2287,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm15 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] @@ -2350,39 +2350,39 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] 
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] @@ -2417,8 +2417,8 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FCP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FCP-NEXT: vzeroupper @@ -2440,18 +2440,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2461,7 +2461,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2471,13 +2471,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2487,22 +2487,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; 
AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2537,18 +2537,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2558,7 +2558,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2568,13 +2568,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2584,22 +2584,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2634,18 +2634,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2655,7 +2655,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2665,13 +2665,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; 
AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2681,22 +2681,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2731,18 +2731,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2752,7 +2752,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2762,13 +2762,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2778,22 +2778,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2828,18 +2828,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2849,7 +2849,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2859,13 +2859,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2875,22 +2875,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -2925,18 +2925,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -2946,7 +2946,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -2956,13 +2956,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -2972,22 +2972,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -3022,18 +3022,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -3043,7 +3043,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -3053,13 +3053,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -3069,22 +3069,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} 
zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -3119,18 +3119,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0] @@ -3140,7 +3140,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 @@ -3150,13 +3150,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm17 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = 
[0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 @@ -3166,22 +3166,22 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm19 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,0,11,0,5,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 @@ -4003,9 +4003,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i64_stride5_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm3 @@ -4016,141 +4016,140 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 800(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 800(%rdi), %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm9 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 960(%rdi), %xmm9 -; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 608(%rdi), %ymm15 +; 
AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 928(%rdi), %ymm10 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 928(%rdi), %ymm12 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm5 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm8 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm14[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 768(%rdi), %ymm15 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4210,184 +4209,187 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 544(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 544(%rdi), %xmm15 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded 
Reload +; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm14 -; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1024(%rdi), %xmm11 -; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm12[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 704(%rdi), %xmm9 -; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,3] +; AVX2-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 384(%rdi), %xmm14 +; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; 
AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = mem[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX2-NEXT: # xmm7 = mem[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[0,1],xmm9[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX2-NEXT: # xmm3 = mem[0,1],xmm15[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 
= ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vpblendd $3, (%rsp), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = mem[0,1],xmm4[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[0,1],xmm11[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[0,1],xmm14[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 
128(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-NEXT: vmovdqa %ymm6, 128(%r9) -; AVX2-NEXT: vmovdqa %ymm15, 96(%r9) -; AVX2-NEXT: vmovdqa %ymm5, 64(%r9) -; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-NEXT: vmovdqa %ymm12, (%r9) -; AVX2-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload +; AVX2-NEXT: # xmm7 = mem[0,1],xmm13[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-NEXT: vmovdqa %ymm7, 192(%r9) +; AVX2-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX2-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-NEXT: vmovdqa %ymm14, 64(%r9) +; AVX2-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm3 
@@ -4398,141 +4400,140 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 
= ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm10 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm12 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm5 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm8 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm15 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4592,184 +4593,187 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %xmm11 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm12[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm9 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,3] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm3 = mem[0,1],xmm15[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $3, (%rsp), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm11[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm14[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8) -; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm6, 128(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm15, 96(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, 64(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm12, (%r9) -; AVX2-FP-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FP-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm7, 192(%r9) +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm14, 64(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FP-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm3 @@ -4780,141 +4784,140 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 
160(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = 
ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = 
mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4974,176 +4977,179 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm12[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm7 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm3 = mem[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $3, (%rsp), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 128(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm15, 96(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9) -; AVX2-FCP-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: 
vmovaps %ymm4, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 192(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 64(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -5160,12 +5166,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5179,7 +5185,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5197,7 +5203,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5227,7 +5233,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5267,7 +5273,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5282,7 +5288,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5296,7 +5302,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5306,14 +5312,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5361,12 +5367,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5380,7 +5386,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5398,7 +5404,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5428,7 +5434,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5468,7 +5474,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5483,7 +5489,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5497,7 +5503,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5507,14 +5513,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5562,12 +5568,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5581,7 +5587,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5599,7 +5605,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5629,7 +5635,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5669,7 +5675,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5684,7 +5690,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5698,7 +5704,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5708,14 +5714,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5763,12 +5769,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 
= [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5782,7 +5788,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -5800,7 +5806,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5830,7 +5836,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -5870,7 +5876,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -5885,7 +5891,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -5899,7 +5905,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, 
%zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -5909,14 +5915,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -5964,12 +5970,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -5983,7 +5989,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6001,7 +6007,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6031,7 +6037,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 
@@ -6071,7 +6077,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6086,7 +6092,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6100,7 +6106,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6110,14 +6116,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6165,12 +6171,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6184,7 +6190,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6202,7 +6208,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6232,7 +6238,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6272,7 +6278,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6287,7 +6293,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6301,7 +6307,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6311,14 +6317,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte 
Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6366,12 +6372,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6385,7 +6391,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6403,7 +6409,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6433,7 +6439,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6473,7 +6479,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, 
%zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6488,7 +6494,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6502,7 +6508,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6512,14 +6518,14 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -6567,12 +6573,12 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm24 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm12 @@ -6586,7 +6592,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm28 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm30 @@ -6604,7 +6610,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6634,7 +6640,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm17 @@ -6674,7 +6680,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm6 = zmm24[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 @@ -6689,7 +6695,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 @@ -6703,7 +6709,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm21 @@ -6713,14 +6719,14 @@ define void 
@load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} @@ -8466,7 +8472,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i64_stride5_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $3240, %rsp # imm = 0xCA8 +; AVX2-NEXT: subq $3336, %rsp # imm = 0xD08 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 864(%rdi), %ymm4 @@ -8478,7 +8484,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8506,7 +8512,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8651,7 +8657,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -8681,7 +8687,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr 
$8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -8690,7 +8696,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1568(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -8700,7 +8706,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1888(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -8727,19 +8733,19 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 448(%rdi), %ymm15 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # 
ymm8 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8779,17 +8785,17 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1968(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2048(%rdi), %ymm13 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8905,29 +8911,29 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 384(%rdi), %xmm13 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8937,351 +8943,360 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX2-NEXT: vmovdqa 864(%rdi), %xmm9 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1024(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm8 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm7 +; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm7 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm8[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm5 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1824(%rdi), %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 1984(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 1824(%rdi), %xmm3 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1984(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 
32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 2144(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 2144(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 2304(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovdqa 2464(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = 
mem[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX2-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-NEXT: # ymm12 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] -; 
AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%r8) -; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r8) -; AVX2-NEXT: vmovdqa %ymm0, 480(%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%r8) +; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%r8) +; AVX2-NEXT: vmovdqa %ymm1, 480(%r9) ; AVX2-NEXT: vmovdqa %ymm2, 448(%r9) -; AVX2-NEXT: vmovdqa %ymm5, 416(%r9) -; AVX2-NEXT: vmovdqa %ymm7, 384(%r9) -; AVX2-NEXT: vmovdqa %ymm4, 352(%r9) -; AVX2-NEXT: vmovdqa %ymm13, 320(%r9) -; AVX2-NEXT: vmovdqa %ymm6, 288(%r9) -; AVX2-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-NEXT: vmovdqa %ymm14, 224(%r9) -; AVX2-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-NEXT: vmovdqa %ymm3, 416(%r9) +; AVX2-NEXT: vmovdqa %ymm4, 384(%r9) +; AVX2-NEXT: vmovdqa %ymm5, 352(%r9) +; AVX2-NEXT: vmovdqa %ymm12, 320(%r9) +; AVX2-NEXT: vmovdqa %ymm13, 288(%r9) +; AVX2-NEXT: vmovdqa %ymm7, 256(%r9) +; AVX2-NEXT: vmovdqa %ymm9, 224(%r9) +; AVX2-NEXT: vmovaps %ymm10, 192(%r9) ; AVX2-NEXT: vmovdqa %ymm11, 160(%r9) -; AVX2-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-NEXT: vmovaps %ymm15, 128(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride5_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $3240, %rsp # imm = 0xCA8 +; AVX2-FP-NEXT: subq $3336, %rsp # imm = 0xD08 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm4 @@ -9293,7 +9308,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9321,7 +9336,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 
32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9466,7 +9481,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -9496,7 +9511,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -9505,7 +9520,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -9515,7 +9530,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1888(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -9542,19 +9557,19 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9594,17 +9609,17 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1968(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 2048(%rdi), %ymm13 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9720,29 +9735,29 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm13 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -9752,351 +9767,360 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm9 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte 
Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm7 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm5 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; 
AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1824(%rdi), %xmm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa 1824(%rdi), %xmm3 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 2144(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 2144(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 2304(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovdqa 2464(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 
32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm12 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded 
Reload +; AVX2-FP-NEXT: # ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8) -; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm0, 480(%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps 
%ymm0, 480(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm1, 480(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm2, 448(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, 416(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm7, 384(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm4, 352(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm13, 320(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm6, 288(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm14, 224(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 416(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm4, 384(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm5, 352(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm12, 320(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm13, 288(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm7, 256(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm9, 224(%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, 192(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm11, 160(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-FP-NEXT: vmovaps %ymm15, 128(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FP-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-FP-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride5_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $3240, %rsp # imm = 0xCA8 +; AVX2-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm4 @@ -10108,7 +10132,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10136,7 +10160,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10281,7 +10305,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -10311,7 +10335,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -10320,7 +10344,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -10330,7 +10354,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 1888(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -10357,19 +10381,19 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm0 ; 
AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10409,17 +10433,17 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1968(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2048(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2288(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 +; 
AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10535,29 +10559,29 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte 
Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -10567,632 +10591,639 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm9 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm7 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm5 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1824(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 1824(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 2144(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 2144(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 2304(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovdqa 2464(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FCP-NEXT: 
vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; 
AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] 
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd 
$192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 
320(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 480(%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 480(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm2, 448(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 416(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 384(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 352(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm13, 320(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 288(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm14, 224(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 416(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 384(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 352(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm12, 320(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm13, 288(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 256(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm9, 224(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm10, 192(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm11, 160(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm15, 128(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i64_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 
1792(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; 
AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm13, 
%zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512-NEXT: vmovdqa64 
960(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-NEXT: 
vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -11203,28 +11234,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm26 
{%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $56, %al ; AVX512-NEXT: kmovw %eax, %k1 @@ -11232,41 +11262,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; 
AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -11274,47 +11305,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; 
AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -11325,9 +11357,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, (%rsi) @@ -11347,320 +11379,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 
320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, (%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 
%zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; 
AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 
%zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -11671,28 +11701,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $56, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -11700,41 +11729,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -11742,47 +11772,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -11793,9 +11824,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; 
AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi) @@ -11815,320 +11846,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512DQ-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-NEXT: 
vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; 
AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 
%zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-NEXT: 
vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; 
AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -12139,28 +12168,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $56, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -12168,41 +12196,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -12210,47 +12239,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload 
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -12261,9 +12291,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rsi) -; 
AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) @@ -12283,320 +12313,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: 
vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 
%zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), 
%zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), 
%zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; 
AVX512DQ-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -12607,28 +12635,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, 
%zmm29, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $56, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -12636,41 +12663,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -12678,47 +12706,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -12729,9 +12758,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) @@ -12751,320 +12780,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 
320(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; 
AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, 
%zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, 
%zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; 
AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -13075,28 +13102,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512BW-NEXT: vmovdqa64 
%zmm6, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -13104,41 +13130,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -13146,47 +13173,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -13197,9 +13225,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) @@ -13219,320 +13247,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; 
AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride5_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q 
%zmm1, %zmm16, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 
1024(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $7, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -13543,28 +13569,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $56, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -13572,41 +13597,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 
{%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -13614,47 +13640,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -13665,9 +13692,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rsi) @@ -13687,320 +13714,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rcx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride5_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 
%zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; 
AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; 
AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, 
%zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $7, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -14011,28 +14036,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $56, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 @@ -14040,41 +14064,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -14082,47 +14107,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, 
%zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -14133,9 +14159,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512DQ-BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rsi) @@ -14155,320 +14181,318 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rcx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
768(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,10,15,0,5,10,15,0] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,11,0,1,6,11,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [6,11,0,1,6,11,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,12,0,2,7,12,0,2] -; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,5,0,11,0,5,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, 
%zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,12,0,2,7,12,0,2] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,0,11,0,5,0,11] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,5,10,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm29 = [1,6,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,6,11,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
ymm6 = [2,7,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,7,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,0,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [11,0,5,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm11 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm6, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, 
%zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm10, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,2,3],zmm18[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm11 = zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm14[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm4 
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $7, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} @@ -14479,28 +14503,27 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,2,3,4,5,6,12] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm29, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm26 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $56, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -14508,41 +14531,42 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,13] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -14550,47 +14574,48 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,9,14] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 448(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -14601,9 +14626,9 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 256(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rsi) @@ -14623,33 +14648,33 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 448(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 256(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 320(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 192(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 256(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 448(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 384(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%r9) -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm20, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512DQ-BW-FCP-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 7d3209397c3df..561f3921845d2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -563,32 +563,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) @@ -605,30 +605,30 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] 
; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,0,6,u] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,4] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,4,0,6] +; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm8, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] -; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,10] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -645,32 +645,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} 
ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) @@ -687,30 +687,30 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,0,6,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,4] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,4,0,6] +; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm8, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] -; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -727,32 +727,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = 
[10,0,6,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512BW-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -769,30 +769,30 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,0,6,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,4] ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,4,0,6] +; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] -; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,10] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512BW-FCP-NEXT: 
vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -809,32 +809,32 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX512DQ-BW-NEXT: vpbroadcastq %xmm6, %ymm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -851,30 +851,30 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,0,6,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,4] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,4,0,6] +; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm8, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6] -; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,10] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -1039,8 +1039,8 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm13[1],xmm12[1] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm10 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] ; AVX-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm11[0] @@ -1100,13 +1100,13 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1114,24 +1114,24 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 240(%rdi), %xmm8 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm10 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm12 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} 
xmm4 = xmm4[1],xmm6[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm6 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm7 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm8[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] ; AVX2-NEXT: vmovaps 64(%rdi), %xmm12 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] ; AVX2-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX2-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] @@ -1143,9 +1143,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] @@ -1168,11 +1168,11 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX2-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-NEXT: vmovaps %ymm9, (%r8) ; AVX2-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-NEXT: vmovaps %ymm5, (%r9) @@ -1186,13 +1186,13 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1200,24 +1200,24 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 104(%rdi), %ymm12 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 296(%rdi), %ymm6 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-FP-NEXT: vbroadcastsd 296(%rdi), %ymm7 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm8[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm12 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] @@ -1229,9 +1229,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] @@ -1254,11 +1254,11 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-FP-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-FP-NEXT: vmovaps %ymm9, (%r8) ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9) ; 
AVX2-FP-NEXT: vmovaps %ymm5, (%r9) @@ -1272,13 +1272,13 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1286,24 +1286,24 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 104(%rdi), %ymm12 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 296(%rdi), %ymm6 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 296(%rdi), %ymm7 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm8[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 160(%rdi), %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FCP-NEXT: vbroadcastsd 160(%rdi), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm12 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] @@ -1315,9 +1315,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; 
AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] @@ -1340,11 +1340,11 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm9, (%r8) ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm5, (%r9) @@ -1366,7 +1366,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: movb $56, %dil ; AVX512-NEXT: kmovw %edi, %k1 @@ -1379,9 +1379,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1393,7 +1393,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1403,18 +1403,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: movb $24, %dil ; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; 
AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1451,7 +1451,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movb $56, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 @@ -1464,9 +1464,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1478,7 +1478,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1488,18 +1488,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: movb $24, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1536,7 +1536,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: movb $56, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 @@ -1549,9 +1549,9 @@ define void 
@load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1563,7 +1563,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1573,18 +1573,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: movb $24, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1621,7 +1621,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $56, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -1634,9 +1634,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1648,7 +1648,7 @@ define void @load_i64_stride6_vf8(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1658,18 +1658,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: movb $24, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1706,7 +1706,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -1719,9 +1719,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1733,7 +1733,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1743,18 +1743,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 
= [10,4,10,4,10,4,10,4] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1791,7 +1791,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movb $56, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 @@ -1804,9 +1804,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1818,7 +1818,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1828,18 +1828,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: movb $24, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; 
AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1876,7 +1876,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movb $56, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 @@ -1889,9 +1889,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1903,7 +1903,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1913,18 +1913,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: movb $24, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -1961,7 +1961,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = 
[0,6,0,10,0,6,0,10] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -1974,9 +1974,9 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] @@ -1988,7 +1988,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] @@ -1998,18 +1998,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $24, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} @@ -2497,637 +2497,646 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i64_stride6_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 
512(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm6[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 384(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-NEXT: vmovaps 432(%rdi), %xmm10 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm10[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 240(%rdi), %xmm11 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-NEXT: vmovaps %ymm10, %ymm4 -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 240(%rdi), %xmm5 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm5[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX2-NEXT: vmovaps %ymm9, %ymm2 +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-NEXT: vmovaps 704(%rdi), %ymm8 +; AVX2-NEXT: vmovaps 672(%rdi), %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm5 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm14[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm6 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] -; AVX2-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm10[1] +; AVX2-NEXT: vbroadcastsd 488(%rdi), %ymm4 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] -; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm3 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm5[1] +; AVX2-NEXT: vbroadcastsd 296(%rdi), %ymm4 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-NEXT: vmovaps %ymm10, %ymm14 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX2-NEXT: vmovaps 448(%rdi), %xmm5 +; AVX2-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-NEXT: vmovaps 208(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm9[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] ; AVX2-NEXT: vmovaps 640(%rdi), %xmm13 -; AVX2-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] +; AVX2-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm13[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX2-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX2-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] -; AVX2-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm9[1] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vmovaps 464(%rdi), %xmm7 -; AVX2-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] +; AVX2-NEXT: vmovaps 464(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm13[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] -; AVX2-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,0,3] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-NEXT: vmovaps 656(%rdi), %xmm9 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX2-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm4[0],xmm5[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] -; AVX2-NEXT: vblendps 
{{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 712(%rdi), %ymm14 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] +; AVX2-NEXT: vbroadcastsd 712(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm13[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%r8) -; AVX2-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-NEXT: vmovaps %ymm11, 32(%r9) -; AVX2-NEXT: vmovaps %ymm13, (%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; 
AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm7[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%r8) +; AVX2-NEXT: vmovaps %ymm9, 96(%r9) +; AVX2-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-NEXT: vmovaps %ymm2, (%rax) +; AVX2-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride6_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovups %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm6[0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm10 +; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm10[0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-FP-NEXT: vmovaps %ymm10, %ymm4 -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm5[0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX2-FP-NEXT: vmovaps %ymm9, %ymm2 +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-FP-NEXT: vbroadcastsd 104(%rdi), %ymm5 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm14[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX2-FP-NEXT: vbroadcastsd 104(%rdi), %ymm6 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] -; AVX2-FP-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm10[1] +; AVX2-FP-NEXT: vbroadcastsd 488(%rdi), %ymm4 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] -; AVX2-FP-NEXT: vbroadcastsd 296(%rdi), %ymm3 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm5[1] +; AVX2-FP-NEXT: vbroadcastsd 296(%rdi), %ymm4 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-FP-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-FP-NEXT: vmovaps %ymm10, %ymm14 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; 
AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm9[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm13 -; AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] +; AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm13[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm9[1] +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] +; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm13[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm9 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 272(%rdi), 
%xmm5 +; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm4[0],xmm5[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 712(%rdi), %ymm14 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] +; AVX2-FP-NEXT: vbroadcastsd 712(%rdi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm13[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 328(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 
# 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FP-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-FP-NEXT: vmovaps %ymm11, 32(%r9) -; AVX2-FP-NEXT: vmovaps %ymm13, (%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FP-NEXT: vbroadcastsd 136(%rdi), %ymm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm7[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 328(%rdi), %ymm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FP-NEXT: vmovaps %ymm9, 96(%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FP-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FP-NEXT: vmovaps 
%ymm2, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride6_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm6[0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm10[0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-FCP-NEXT: vmovaps %ymm10, %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm5[0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd 
{{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-FCP-NEXT: vbroadcastsd 104(%rdi), %ymm5 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm14[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rdi), %ymm6 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] -; AVX2-FCP-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm10[1] +; AVX2-FCP-NEXT: vbroadcastsd 488(%rdi), %ymm4 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] -; AVX2-FCP-NEXT: vbroadcastsd 296(%rdi), %ymm3 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm5[1] +; AVX2-FCP-NEXT: vbroadcastsd 296(%rdi), %ymm4 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-FCP-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-FCP-NEXT: vmovaps %ymm10, %ymm14 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX2-FCP-NEXT: vbroadcastsd 160(%rdi), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 
160(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm9[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] +; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm13[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = 
ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm13[1] +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm9[1] +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] +; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm13[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = 
ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm9 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm4[0],xmm5[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 712(%rdi), %ymm14 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] +; AVX2-FCP-NEXT: vbroadcastsd 712(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm13[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 328(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) -; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm13, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FCP-NEXT: vbroadcastsd 136(%rdi), %ymm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm7[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 328(%rdi), %ymm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FCP-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FCP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3145,12 +3154,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512-NEXT: movb $56, %dil @@ -3173,7 +3182,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3192,42 +3201,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-NEXT: movb $24, %dil ; AVX512-NEXT: kmovw %edi, %k2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512-NEXT: vpermt2q 
%zmm8, %zmm21, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512-NEXT: movb $-32, %dil ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -3288,12 +3297,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512-FCP-NEXT: movb $56, %dil @@ -3316,7 +3325,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3335,42 +3344,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512-FCP-NEXT: movb $24, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512-FCP-NEXT: movb $-32, %dil ; AVX512-FCP-NEXT: kmovw %edi, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = 
[0,10,0,6,0,10,0,6] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -3431,12 +3440,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-NEXT: movb $56, %dil @@ -3459,7 +3468,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3478,42 +3487,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-NEXT: movb $24, %dil ; AVX512DQ-NEXT: kmovw %edi, %k2 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512DQ-NEXT: movb $-32, %dil ; AVX512DQ-NEXT: kmovw %edi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-NEXT: 
vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -3574,12 +3583,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: movb $56, %dil @@ -3602,7 +3611,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3621,42 +3630,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: movb $24, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512DQ-FCP-NEXT: movb $-32, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -3717,12 +3726,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512BW-NEXT: movb $56, %dil @@ -3745,7 +3754,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: 
vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3764,42 +3773,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movb $24, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -3860,12 +3869,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: movb $56, %dil @@ -3888,7 +3897,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -3907,42 +3916,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: movb $24, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512BW-FCP-NEXT: movb $-32, %dil ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 
%zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -4003,12 +4012,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: movb $56, %dil @@ -4031,7 +4040,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -4050,42 +4059,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: movb $24, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512DQ-BW-NEXT: # 
zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512DQ-BW-NEXT: movb $-32, %dil ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -4146,12 +4155,12 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,6,0,10,0,6,0,10] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} ymm6 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $56, %dil @@ -4174,7 +4183,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} @@ -4193,42 +4202,42 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm18 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $24, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,6,12,0,0,6,12] +; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,6,12,0,0,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,5,11,5,11,5,11,5] +; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,7,13,0,1,7,13] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm24, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,7,13,0,1,7,13] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,10,0,6,0,10,0,6] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 @@ -5292,11 +5301,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %ymm7 @@ -5339,48 +5348,48 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm3[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 512(%rdi), %ymm8 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-NEXT: vmovups %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 432(%rdi), %xmm11 -; AVX2-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 432(%rdi), %xmm12 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm2[0],xmm12[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 896(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 816(%rdi), %xmm13 -; AVX2-NEXT: vmovaps 768(%rdi), %xmm10 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 768(%rdi), %xmm15 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm11 = xmm15[0],xmm13[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 1280(%rdi), %ymm5 ; AVX2-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 1200(%rdi), %xmm14 +; AVX2-NEXT: vmovaps 1200(%rdi), %xmm11 ; AVX2-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm11[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload @@ -5406,367 +5415,368 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm4[1],mem[1] -; AVX2-NEXT: vbroadcastsd 1448(%rdi), %ymm4 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] -; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm3[1] +; AVX2-NEXT: vbroadcastsd 104(%rdi), %ymm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm12[1] ; AVX2-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm13[1] ; AVX2-NEXT: vbroadcastsd 872(%rdi), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] -; AVX2-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX2-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovaps 640(%rdi), %xmm6 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: 
vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 1120(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-NEXT: vmovaps 1024(%rdi), %xmm8 ; AVX2-NEXT: vmovaps 976(%rdi), %xmm9 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm8[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 1504(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-NEXT: vmovaps 1408(%rdi), %xmm10 ; AVX2-NEXT: vmovaps 1360(%rdi), %xmm11 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm10[0] +; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm2[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 544(%rdi), %ymm1 +; AVX2-NEXT: vbroadcastsd 544(%rdi), %ymm3 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm13 = xmm6[0],xmm0[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 928(%rdi), %ymm12 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] ; AVX2-NEXT: vmovaps 832(%rdi), %xmm12 ; AVX2-NEXT: vmovaps 784(%rdi), %xmm13 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] ; AVX2-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm7[1],mem[1] ; AVX2-NEXT: vmovaps 736(%rdi), %ymm7 -; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-NEXT: vmovaps 1120(%rdi), %ymm6 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] -; AVX2-NEXT: vmovaps 1504(%rdi), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-NEXT: vmovaps 1312(%rdi), %ymm3 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-NEXT: vmovaps 1120(%rdi), %ymm8 +; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1] +; AVX2-NEXT: vmovaps 1504(%rdi), %ymm4 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-NEXT: vmovaps 544(%rdi), %ymm5 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-NEXT: vmovaps 464(%rdi), %xmm15 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm15[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 608(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 848(%rdi), %xmm10 -; AVX2-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] +; AVX2-NEXT: vmovaps 800(%rdi), %xmm9 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 992(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-NEXT: vmovaps 992(%rdi), %xmm7 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] +; AVX2-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 1424(%rdi), %xmm2 -; AVX2-NEXT: vmovaps 1376(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-NEXT: vmovaps 1424(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 1376(%rdi), %xmm3 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm15[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 
192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r9) +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps 
%ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-NEXT: vmovaps %ymm5, 160(%rax) -; AVX2-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-NEXT: vmovaps %ymm13, 64(%rax) -; AVX2-NEXT: vmovaps %ymm15, 32(%rax) +; AVX2-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) ; AVX2-NEXT: addq $1496, %rsp # imm = 0x5D8 @@ -5779,11 +5789,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm7 @@ -5826,48 +5836,48 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm3[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 432(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm11 = xmm2[0],xmm12[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 816(%rdi), %xmm13 -; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm10 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm15 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm11 = xmm15[0],xmm13[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 1200(%rdi), %xmm14 +; AVX2-FP-NEXT: vmovaps 1200(%rdi), %xmm11 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm11[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload @@ -5893,367 +5903,368 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm4[1],mem[1] -; AVX2-FP-NEXT: vbroadcastsd 1448(%rdi), %ymm4 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] -; AVX2-FP-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm3[1] +; AVX2-FP-NEXT: vbroadcastsd 104(%rdi), %ymm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm12[1] ; AVX2-FP-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm13[1] ; AVX2-FP-NEXT: vbroadcastsd 872(%rdi), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] -; AVX2-FP-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX2-FP-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1120(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovaps 976(%rdi), %xmm9 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm8[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1504(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovaps 1360(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm10[0] +; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm2[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 544(%rdi), %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 544(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX2-FP-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm6[0],xmm0[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 928(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm12 ; AVX2-FP-NEXT: vmovaps 784(%rdi), %xmm13 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-FP-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm2[1],ymm14[1],ymm2[3],ymm14[3] ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm7[1],mem[1] ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] -; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm3 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-FP-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1] +; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5 
-; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm15 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm15[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 848(%rdi), %xmm10 -; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] +; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 
= ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] +; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 1424(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 1424(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-FP-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm15[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-FP-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FP-NEXT: vmovaps %ymm13, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm15, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FP-NEXT: addq $1496, %rsp # imm = 0x5D8 @@ -6266,11 +6277,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7 @@ -6313,48 +6324,48 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm3[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 432(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm11 = xmm2[0],xmm12[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 816(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm15 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm11 = xmm15[0],xmm13[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 1200(%rdi), %xmm14 +; AVX2-FCP-NEXT: vmovaps 1200(%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm11[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload @@ -6380,367 +6391,368 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm4[1],mem[1] -; AVX2-FCP-NEXT: vbroadcastsd 1448(%rdi), %ymm4 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm3[1] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rdi), %ymm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm12[1] ; AVX2-FCP-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm13[1] ; AVX2-FCP-NEXT: vbroadcastsd 872(%rdi), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] -; AVX2-FCP-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = 
ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX2-FCP-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1120(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovaps 976(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm8[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1504(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovaps 1360(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps 
{{.*#+}} xmm2 = xmm11[0],xmm10[0] +; AVX2-FCP-NEXT: vbroadcastsd 160(%rdi), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm2[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 544(%rdi), %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 544(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX2-FCP-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm13 = xmm6[0],xmm0[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 928(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm12 ; AVX2-FCP-NEXT: vmovaps 784(%rdi), %xmm13 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-FCP-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: 
vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm7[1],mem[1] ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] -; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm3 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1] +; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm15 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm15[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 848(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] +; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} 
xmm1 = xmm9[0],xmm10[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] +; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 1424(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 1424(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-FCP-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: 
vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm15[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-FCP-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm13, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm15, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8 @@ -6765,7 +6777,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6784,7 +6796,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6845,11 +6857,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7117,7 +7129,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7136,7 +7148,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7197,11 +7209,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7469,7 +7481,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7488,7 +7500,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7549,11 +7561,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7821,7 +7833,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7840,7 +7852,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7901,11 +7913,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} ymm31 = [11,1,7,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8173,7 +8185,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8192,7 +8204,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8253,11 +8265,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8525,7 +8537,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8544,7 +8556,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8605,11 +8617,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8877,7 +8889,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8896,7 +8908,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8957,11 +8969,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9229,7 +9241,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9248,7 +9260,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm10 = [1,7,13,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9309,11 +9321,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm31 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11680,7 +11692,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i64_stride6_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX2-NEXT: subq $3448, %rsp # imm = 0xD78 ; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm4 @@ -11902,13 +11914,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vbroadcastsd 1448(%rdi), %ymm15 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -11934,13 +11946,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vbroadcastsd 2984(%rdi), %ymm15 -; AVX2-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 2984(%rdi), %ymm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -11964,12 +11976,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm9[1],mem[1] -; AVX2-NEXT: vbroadcastsd 1256(%rdi), %ymm9 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1] ; AVX2-NEXT: vbroadcastsd 1640(%rdi), %ymm7 @@ -11987,14 +11999,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-NEXT: vbroadcastsd 2792(%rdi), %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 2792(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 208(%rdi), %xmm1 @@ -12030,13 +12042,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vbroadcastsd 1888(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1888(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovaps 1744(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -12070,15 +12082,15 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 2896(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vbroadcastsd 2848(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 2848(%rdi), %ymm0 -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-NEXT: vmovaps 2752(%rdi), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12105,14 +12117,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovaps 1600(%rdi), %xmm12 +; AVX2-NEXT: vmovaps 1600(%rdi), %xmm14 ; AVX2-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] +; AVX2-NEXT: vbroadcastsd 1312(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1312(%rdi), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] ; AVX2-NEXT: vmovaps 1216(%rdi), %xmm9 ; AVX2-NEXT: vmovaps 1168(%rdi), %xmm8 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] @@ -12139,13 +12151,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; 
AVX2-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] -; AVX2-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12169,34 +12181,34 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 736(%rdi), %ymm13 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-NEXT: vmovaps 928(%rdi), %ymm5 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 1120(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 1120(%rdi), %ymm5 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-NEXT: vmovaps 1312(%rdi), %ymm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX2-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12210,51 +12222,50 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-NEXT: vmovaps 1696(%rdi), %ymm9 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm14[1] +; AVX2-NEXT: vmovaps 1696(%rdi), %ymm7 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vunpckhpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 1888(%rdi), %ymm9 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 2080(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 2080(%rdi), %ymm10 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 2272(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 2272(%rdi), %ymm11 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 2464(%rdi), %ymm14 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12268,7 +12279,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-NEXT: vmovaps 2848(%rdi), %ymm2 @@ -12289,7 +12300,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12302,7 +12313,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] @@ -12314,12 +12325,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 464(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 656(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12329,7 +12340,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 848(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12339,7 +12350,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 1040(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12349,7 +12360,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 1232(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12369,7 +12380,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: vmovaps 1616(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12379,343 +12390,349 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-NEXT: vmovaps 1808(%rdi), %xmm3 +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 1760(%rdi), %xmm13 -; 
AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2000(%rdi), %xmm1 +; AVX2-NEXT: vmovaps 2000(%rdi), %xmm3 +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2192(%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] +; AVX2-NEXT: vmovaps 2192(%rdi), %xmm15 +; AVX2-NEXT: vmovaps 2144(%rdi), %xmm11 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm15[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2384(%rdi), %xmm12 -; AVX2-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] +; AVX2-NEXT: vmovaps 2384(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 2336(%rdi), %xmm12 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2576(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 2528(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2768(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 2720(%rdi), %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-NEXT: vmovaps 2960(%rdi), %xmm2 -; AVX2-NEXT: vmovaps 2912(%rdi), %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 712(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1096(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1288(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1672(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 1864(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 2056(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 2248(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 2440(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 2632(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 2824(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 3016(%rdi), %ymm0 -; 
AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) -; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 480(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 448(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 416(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 384(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 352(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 320(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 288(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 256(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r9) +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-NEXT: vmovaps 2576(%rdi), %xmm9 +; AVX2-NEXT: vmovaps 2528(%rdi), %xmm7 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm9[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-NEXT: vmovaps 2768(%rdi), %xmm6 +; AVX2-NEXT: vmovaps 2720(%rdi), %xmm5 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-NEXT: vmovaps 2960(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 2912(%rdi), %xmm3 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] +; AVX2-NEXT: vbroadcastsd 136(%rdi), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 
+; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 328(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 520(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 712(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 1096(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 1288(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 1480(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vbroadcastsd 1672(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 1864(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 2056(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 2248(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm15[1] +; AVX2-NEXT: vbroadcastsd 2440(%rdi), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 2632(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 2824(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 3016(%rdi), %ymm0 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 448(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 384(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 320(%rsi) +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 256(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 480(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 416(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 352(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 288(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 448(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 320(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 256(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 480(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 352(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 288(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 
+; AVX2-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 256(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 320(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 384(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 448(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 480(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 416(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 352(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 288(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 480(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 448(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 416(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 384(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 352(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 320(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 288(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 256(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 480(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 448(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 416(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 384(%r9) +; AVX2-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 352(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 320(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 288(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-NEXT: vmovaps %ymm5, 416(%rax) -; AVX2-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-NEXT: vmovaps %ymm13, 288(%rax) -; AVX2-NEXT: vmovaps %ymm15, 256(%rax) +; AVX2-NEXT: vmovaps %ymm5, 448(%rax) +; AVX2-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12732,13 +12749,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX2-NEXT: addq $3448, %rsp # imm = 0xD78 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride6_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX2-FP-NEXT: subq $3448, %rsp # imm = 0xD78 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm4 @@ -12960,13 +12977,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vbroadcastsd 1448(%rdi), %ymm15 -; AVX2-FP-NEXT: 
vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -12992,13 +13009,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vbroadcastsd 2984(%rdi), %ymm15 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 2984(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -13022,12 +13039,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm0 = xmm9[1],mem[1] -; AVX2-FP-NEXT: vbroadcastsd 1256(%rdi), %ymm9 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1] ; AVX2-FP-NEXT: vbroadcastsd 1640(%rdi), %ymm7 @@ -13045,14 +13062,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vbroadcastsd 2792(%rdi), %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 2792(%rdi), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 208(%rdi), %xmm1 @@ -13088,13 +13105,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FP-NEXT: vbroadcastsd 1888(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 1888(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 1744(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -13128,15 +13145,15 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 2896(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FP-NEXT: vbroadcastsd 2848(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 2848(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FP-NEXT: vmovaps 2752(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13163,14 +13180,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] +; AVX2-FP-NEXT: vbroadcastsd 1312(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 1312(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm9 ; AVX2-FP-NEXT: vmovaps 1168(%rdi), %xmm8 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] @@ -13197,13 +13214,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13227,34 +13244,34 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd 
{{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm5 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm7 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13268,51 +13285,50 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm9 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm14[1] +; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vunpckhpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 2080(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 2080(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 2272(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 2272(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FP-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 2464(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13326,7 +13342,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-FP-NEXT: vmovaps 2848(%rdi), %ymm2 @@ -13347,7 +13363,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13360,7 +13376,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] @@ -13372,12 +13388,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 464(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 656(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13387,7 +13403,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 848(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13397,7 +13413,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 1040(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13407,7 +13423,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 1232(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13427,7 +13443,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FP-NEXT: vmovaps 1616(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13437,343 +13453,349 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps 1808(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 1760(%rdi), %xmm13 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2000(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps 2000(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2192(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] +; AVX2-FP-NEXT: vmovaps 2192(%rdi), %xmm15 +; AVX2-FP-NEXT: vmovaps 2144(%rdi), %xmm11 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm15[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2384(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] +; AVX2-FP-NEXT: vmovaps 2384(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovaps 2336(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2576(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 2528(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-FP-NEXT: vmovaps 2576(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps 2528(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm9[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2768(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovaps 2720(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] +; AVX2-FP-NEXT: vmovaps 2768(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps 2720(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FP-NEXT: vmovaps 2960(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps 2912(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-FP-NEXT: vmovaps 2960(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovaps 2912(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-FP-NEXT: vbroadcastsd 136(%rdi), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = 
xmm1[1],mem[1] +; AVX2-FP-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 1480(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vbroadcastsd 1672(%rdi), %ymm2 +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 1672(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 1864(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 2056(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 2248(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 2440(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm15[1] +; AVX2-FP-NEXT: vbroadcastsd 2440(%rdi), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 2632(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 2824(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 3016(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 
416(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: 
vmovaps %ymm3, 448(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 384(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 320(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 480(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 416(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 352(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 288(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 448(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 320(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 480(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 352(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 288(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 320(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 384(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 448(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 480(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 416(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 352(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 288(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 480(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 448(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 416(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 384(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 352(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 320(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 288(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 480(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 448(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 416(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 384(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 352(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 320(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 288(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 416(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FP-NEXT: vmovaps %ymm13, 288(%rax) -; AVX2-FP-NEXT: vmovaps %ymm15, 256(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 448(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13790,13 +13812,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FP-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX2-FP-NEXT: addq $3448, %rsp # imm = 0xD78 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride6_vf64: ; 
AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX2-FCP-NEXT: subq $3448, %rsp # imm = 0xD78 ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm4 @@ -14013,19 +14035,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vbroadcastsd 1064(%rdi), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vbroadcastsd 1448(%rdi), %ymm15 +; AVX2-FCP-NEXT: vbroadcastsd 1064(%rdi), %ymm15 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd 1448(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] @@ -14050,13 +14072,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vbroadcastsd 2984(%rdi), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 2984(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -14080,12 +14102,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm0 = xmm9[1],mem[1] -; AVX2-FCP-NEXT: vbroadcastsd 1256(%rdi), %ymm9 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 1256(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1] ; AVX2-FCP-NEXT: vbroadcastsd 1640(%rdi), %ymm7 @@ -14103,14 +14125,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 2792(%rdi), %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 2792(%rdi), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 208(%rdi), %xmm1 @@ -14146,13 +14168,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FCP-NEXT: vbroadcastsd 1888(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 1888(%rdi), %ymm0 -; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 1744(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -14186,15 +14208,15 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 2896(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FCP-NEXT: vbroadcastsd 2848(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 2848(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-FCP-NEXT: vmovaps 2752(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14221,14 +14243,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] +; AVX2-FCP-NEXT: vbroadcastsd 1312(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 1312(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] ; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm9 ; AVX2-FCP-NEXT: vmovaps 1168(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] @@ -14255,13 +14277,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: 
vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm0[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14285,34 +14307,34 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm5 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm7 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX2-FCP-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14326,51 +14348,50 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm9 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm14[1] +; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vunpckhpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 2080(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; 
AVX2-FCP-NEXT: vmovaps 2272(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 2272(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-FCP-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 2464(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14384,7 +14405,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-FCP-NEXT: vmovaps 2848(%rdi), %ymm2 @@ -14405,7 +14426,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14418,7 +14439,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] @@ -14430,12 +14451,12 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 464(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 
416(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 656(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14445,7 +14466,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 848(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14455,7 +14476,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 1040(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14465,7 +14486,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 1232(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14485,7 +14506,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FCP-NEXT: vmovaps 1616(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14495,343 +14516,349 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps 1808(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2000(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps 2000(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2192(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] +; AVX2-FCP-NEXT: vmovaps 2192(%rdi), %xmm15 +; AVX2-FCP-NEXT: vmovaps 2144(%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm15[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2384(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] +; AVX2-FCP-NEXT: vmovaps 2384(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovaps 2336(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm13[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2576(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 2528(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-FCP-NEXT: vmovaps 2576(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps 2528(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm9[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2768(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovaps 2720(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] +; AVX2-FCP-NEXT: vmovaps 2768(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps 2720(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-FCP-NEXT: vmovaps 2960(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps 2912(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,0,3] +; AVX2-FCP-NEXT: vmovaps 2960(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovaps 2912(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-FCP-NEXT: vbroadcastsd 136(%rdi), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; 
AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vbroadcastsd 904(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 1480(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vbroadcastsd 1672(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 1672(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 1864(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 2056(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 2248(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 2440(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm15[1] +; AVX2-FCP-NEXT: vbroadcastsd 2440(%rdi), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm13[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 2632(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 2824(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 3016(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 416(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FCP-NEXT: vmovaps %ymm3, 288(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps 
%ymm3, 416(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 416(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 416(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 160(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 128(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 416(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm13, 288(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm15, 256(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 448(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14848,7 +14875,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FCP-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX2-FCP-NEXT: addq $3448, %rsp # imm = 0xD78 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -14876,72 +14903,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-NEXT: vpermt2q 
%zmm5, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[12,0,0,6,12,0,0,6] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14950,16 +14977,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14969,16 +14996,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q 
%zmm0, %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14992,44 +15019,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15632,72 +15659,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15706,16 +15733,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, 
%zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15725,16 +15752,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15748,44 +15775,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16388,72 +16415,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; 
AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16462,16 +16489,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16481,16 +16508,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ 
-16504,44 +16531,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17144,72 +17171,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17218,16 +17245,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17237,16 +17264,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17260,44 +17287,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17900,72 +17927,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-NEXT: 
vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17974,16 +18001,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17993,16 +18020,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q 
%zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18016,44 +18043,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18656,72 +18683,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; 
AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18730,16 +18757,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18749,16 +18776,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18772,44 +18799,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19412,72 +19439,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19486,16 +19513,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19505,16 +19532,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19528,44 +19555,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; 
AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20168,72 +20195,72 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, 
%zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,0,0,6,12,0,0,6] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20242,16 +20269,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20261,16 +20288,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20284,44 +20311,44 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,6,12,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,7,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [10,0,6,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm2, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [11,1,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index cc3e5f3d1d82e..a88de77d7122c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -47,7 +47,6 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i64_stride7_vf2: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovapd 16(%rdi), %xmm0 ; AVX-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 @@ -60,12 +59,13 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 96(%rdi), %xmm6 ; AVX-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: vmovaps %xmm4, (%rsi) ; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: vmovapd %xmm5, (%rcx) ; AVX-NEXT: vmovapd %xmm0, (%r8) ; AVX-NEXT: vmovapd %xmm3, (%r9) -; AVX-NEXT: vmovdqa %xmm6, (%r10) +; AVX-NEXT: vmovdqa %xmm6, (%rdi) ; AVX-NEXT: vmovaps %xmm1, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -73,7 +73,6 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-LABEL: load_i64_stride7_vf2: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] @@ -87,12 +86,13 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm6 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-NEXT: vmovaps %xmm4, (%rcx) ; AVX2-NEXT: vextracti128 $1, %ymm0, (%r8) ; AVX2-NEXT: vmovaps %xmm5, (%r9) -; AVX2-NEXT: vmovdqa %xmm6, (%r10) +; AVX2-NEXT: vmovdqa %xmm6, (%rdi) ; AVX2-NEXT: vmovaps %xmm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -100,7 +100,6 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-LABEL: load_i64_stride7_vf2: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] @@ -114,12 +113,13 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm6 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovaps %xmm4, 
(%rcx) ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, (%r8) ; AVX2-FP-NEXT: vmovaps %xmm5, (%r9) -; AVX2-FP-NEXT: vmovdqa %xmm6, (%r10) +; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdi) ; AVX2-FP-NEXT: vmovaps %xmm1, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -127,7 +127,6 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i64_stride7_vf2: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] @@ -141,12 +140,13 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovaps %xmm4, (%rcx) ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, (%r8) ; AVX2-FCP-NEXT: vmovaps %xmm5, (%r9) -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r10) +; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdi) ; AVX2-FCP-NEXT: vmovaps %xmm1, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -441,32 +441,32 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovapd 192(%rdi), %ymm0 ; AVX-NEXT: vmovapd 128(%rdi), %ymm2 -; AVX-NEXT: vmovapd 160(%rdi), %ymm4 -; AVX-NEXT: vmovapd 96(%rdi), %ymm5 +; AVX-NEXT: vmovapd 160(%rdi), %ymm5 +; AVX-NEXT: vmovapd 96(%rdi), %ymm6 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1,2],ymm1[3] ; AVX-NEXT: vmovapd 16(%rdi), %xmm7 ; AVX-NEXT: vmovapd 48(%rdi), %xmm3 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX-NEXT: vmovapd 80(%rdi), %xmm9 -; AVX-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2] +; AVX-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm3[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm5[1],ymm6[3],ymm5[2] ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] ; AVX-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3] -; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3] -; AVX-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm7[0],mem[1] +; AVX-NEXT: vmovapd 80(%rdi), %xmm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm10[0],ymm7[2],ymm10[3] +; AVX-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm8, 
%ymm0, %ymm8 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm8 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3] -; AVX-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] +; AVX-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm10[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2] ; AVX-NEXT: vmovdqa 96(%rdi), %xmm9 @@ -475,9 +475,9 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],mem[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX-NEXT: vmovapd %ymm6, (%rsi) -; AVX-NEXT: vmovapd %ymm5, (%rdx) -; AVX-NEXT: vmovapd %ymm4, (%rcx) +; AVX-NEXT: vmovapd %ymm4, (%rsi) +; AVX-NEXT: vmovapd %ymm6, (%rdx) +; AVX-NEXT: vmovapd %ymm5, (%rcx) ; AVX-NEXT: vmovapd %ymm7, (%r8) ; AVX-NEXT: vmovapd %ymm8, (%r9) ; AVX-NEXT: vmovapd %ymm2, (%r10) @@ -495,19 +495,19 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -547,19 +547,19 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq 
128(%rdi), %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -599,19 +599,19 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -647,21 +647,21 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm6, %ymm7 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; 
AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -671,17 +671,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) @@ -702,18 +702,18 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] -; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,7] +; AVX512-FCP-NEXT: vpermq %zmm3, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 @@ -723,17 +723,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 ; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -752,21 +752,21 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512DQ-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastq %xmm6, %ymm7 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -776,17 +776,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512DQ-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi) @@ -807,18 +807,18 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] -; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,7] +; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 @@ -828,17 +828,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} xmm3 = [6,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -857,21 +857,21 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512BW-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpbroadcastq %xmm6, %ymm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -881,17 +881,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -912,18 +912,18 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; 
AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] -; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,7] +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 @@ -933,17 +933,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -962,21 +962,21 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] +; 
AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpbroadcastq 176(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vpbroadcastq %xmm6, %ymm7 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8 @@ -986,17 +986,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512DQ-BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -1017,18 +1017,18 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 @@ -1038,17 +1038,17 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi) @@ -1306,313 +1306,313 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i64_stride7_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm9[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 384(%rdi), 
%xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm4[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 288(%rdi), %xmm15 -; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm13 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX2-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = mem[0,1],xmm13[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 
192(%rdi), %xmm13 -; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm11[2,3] ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm13 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-NEXT: 
vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm13 = ymm13[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-NEXT: vmovdqa %ymm8, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm7, 32(%rdx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-NEXT: vmovdqa %ymm10, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-NEXT: vmovdqa %ymm12, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm11, (%r8) -; AVX2-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-NEXT: vmovdqa %ymm13, (%r9) +; AVX2-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride7_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm9[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm15 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm13 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = mem[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; 
AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm11[2,3] ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm13 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq 
{{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm13 = ymm13[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rdx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm12, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm11, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm13, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride7_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm10 -; 
AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm9[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = 
ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = mem[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm11[2,3] ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm13 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm13 = ymm13[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rdx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 32(%r8) -; AVX2-FCP-NEXT: 
vmovdqa %ymm11, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm13, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1639,16 +1639,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1659,46 +1659,46 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: movb $24, %r10b -; AVX512-NEXT: kmovw %r10d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512-NEXT: movb $24, %dil +; AVX512-NEXT: kmovw %edi, %k2 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512-NEXT: movb $-32, %r10b -; AVX512-NEXT: kmovw %r10d, %k1 +; AVX512-NEXT: movb $-32, %dil +; AVX512-NEXT: kmovw %edi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,4,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -1748,16 +1748,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1768,46 +1768,46 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-FCP-NEXT: movb $24, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512-FCP-NEXT: movb $24, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k2 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 
{%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512-FCP-NEXT: movb $-32, %r10b -; AVX512-FCP-NEXT: kmovw %r10d, %k1 +; AVX512-FCP-NEXT: movb $-32, %dil +; AVX512-FCP-NEXT: kmovw %edi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,4,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -1857,16 +1857,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512DQ-NEXT: vpermi2q 
%zmm9, %zmm10, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1877,46 +1877,46 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQ-NEXT: movb $24, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-NEXT: movb $24, %dil +; AVX512DQ-NEXT: kmovw %edi, %k2 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512DQ-NEXT: movb $-32, %r10b -; AVX512DQ-NEXT: kmovw %r10d, %k1 +; AVX512DQ-NEXT: movb $-32, %dil +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,4,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -1966,16 +1966,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = 
mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 @@ -1986,46 +1986,46 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQ-FCP-NEXT: movb $24, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-FCP-NEXT: movb $24, %dil +; AVX512DQ-FCP-NEXT: kmovw %edi, %k2 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: movb $-32, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 +; AVX512DQ-FCP-NEXT: movb $-32, %dil +; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,4,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 
{%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -2059,82 +2059,82 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, 
%zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm8, %zmm7 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movb $24, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm14 ; AVX512BW-NEXT: movb $-32, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,4,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa 64(%rdi), 
%ymm5 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -2151,11 +2151,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -2168,82 +2168,82 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; 
AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movb $24, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm14 ; AVX512BW-FCP-NEXT: movb $-32, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,4,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -2260,11 +2260,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -2277,82 +2277,82 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movb $24, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm14 ; AVX512DQ-BW-NEXT: movb $-32, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,4,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -2369,11 +2369,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2386,82 +2386,82 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: 
vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} @@ -2478,11 +2478,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 
(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -2786,11 +2786,11 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] +; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm7 +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX-NEXT: vmovaps 720(%rdi), %xmm7 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2866,8 +2866,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 752(%rdi), %xmm12 -; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3] +; AVX-NEXT: vmovapd 752(%rdi), %xmm13 +; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm13[0],ymm5[2],ymm13[3] ; AVX-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 @@ -2875,139 +2875,139 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX-NEXT: vmovapd 304(%rdi), %xmm7 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3] -; AVX-NEXT: vmovdqa 416(%rdi), %xmm8 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 416(%rdi), %xmm9 +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm10[1],ymm11[0],ymm10[2],ymm11[3] ; AVX-NEXT: vmovdqa 192(%rdi), %xmm10 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 528(%rdi), %xmm4 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3] ; AVX-NEXT: vmovdqa 640(%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0,1],ymm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 ; AVX-NEXT: vmovapd 352(%rdi), %ymm6 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] ; AVX-NEXT: vmovapd 256(%rdi), %xmm8 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1] -; 
AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm7[0,1],ymm2[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 800(%rdi), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] -; AVX-NEXT: vmovapd 704(%rdi), %xmm5 -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3] +; AVX-NEXT: vmovapd 800(%rdi), %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] +; AVX-NEXT: vmovapd 704(%rdi), %xmm3 +; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm3[0],xmm13[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm2[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX-NEXT: vmovapd 480(%rdi), %xmm1 -; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 576(%rdi), %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX-NEXT: vmovapd 480(%rdi), %xmm13 +; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm13[0],xmm4[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX-NEXT: vmovapd 128(%rdi), %ymm10 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3] -; AVX-NEXT: vmovapd 416(%rdi), %ymm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2] -; AVX-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3] -; AVX-NEXT: vmovapd 864(%rdi), %ymm4 -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2] -; AVX-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3] -; AVX-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 640(%rdi), %ymm3 -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3] -; AVX-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2] -; AVX-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX-NEXT: vmovapd %ymm13, 64(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%r8) -; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%r8) -; AVX-NEXT: vmovapd %ymm11, (%r9) -; AVX-NEXT: vmovapd %ymm9, 64(%r9) -; AVX-NEXT: vmovapd %ymm12, 96(%r9) -; AVX-NEXT: vmovapd %ymm14, 32(%r9) +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 416(%rdi), %ymm11 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[3],ymm11[2] +; AVX-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 864(%rdi), %ymm8 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[3],ymm8[2] +; AVX-NEXT: vmovdqa 768(%rdi), %xmm5 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] +; AVX-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 640(%rdi), %ymm13 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[3],ymm13[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] +; AVX-NEXT: vmovapd 192(%rdi), %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[3],ymm0[2] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 
16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = mem[0,1,2],ymm8[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm5[0,1],ymm8[2,3] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: # ymm8 = mem[0,1,2],ymm13[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 64(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, (%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX-NEXT: vmovapd %ymm12, 64(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 32(%r8) +; AVX-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm8, 96(%r8) +; AVX-NEXT: vmovapd %ymm4, (%r9) +; AVX-NEXT: vmovapd %ymm7, 64(%r9) +; AVX-NEXT: vmovapd %ymm9, 96(%r9) +; AVX-NEXT: vmovapd %ymm15, 32(%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovapd %ymm10, (%rax) -; AVX-NEXT: vmovapd %ymm7, 64(%rax) -; AVX-NEXT: vmovapd %ymm5, 96(%rax) -; AVX-NEXT: vmovapd %ymm8, 32(%rax) +; AVX-NEXT: vmovapd %ymm2, 64(%rax) +; AVX-NEXT: vmovapd %ymm3, 96(%rax) +; AVX-NEXT: vmovapd %ymm6, 32(%rax) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovapd %ymm3, 64(%rax) -; AVX-NEXT: vmovapd %ymm1, (%rax) -; AVX-NEXT: vmovapd %ymm0, 96(%rax) -; AVX-NEXT: vmovapd %ymm2, 32(%rax) +; AVX-NEXT: vmovapd %ymm0, 64(%rax) +; AVX-NEXT: vmovapd %ymm5, (%rax) +; AVX-NEXT: vmovapd %ymm14, 96(%rax) +; AVX-NEXT: vmovapd %ymm1, 32(%rax) ; AVX-NEXT: addq $552, %rsp # imm = 0x228 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; 
AVX2-LABEL: load_i64_stride7_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -3027,16 +3027,18 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqa 672(%rdi), %xmm6 -; AVX2-NEXT: vmovdqa 720(%rdi), %xmm13 -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3] +; AVX2-NEXT: vmovdqa 720(%rdi), %xmm7 +; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -3083,163 +3085,161 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm4 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm6[2,3] +; AVX2-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 
864(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 416(%rdi), %xmm1 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm3[2,3] +; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm3[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 704(%rdi), %xmm15 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm3 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm3 -; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = 
ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-NEXT: vmovdqa %ymm12, 64(%r9) -; AVX2-NEXT: vmovdqa %ymm14, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm14, 64(%r8) +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%r8) +; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-NEXT: vmovdqa %ymm13, 32(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm15, (%rax) +; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i64_stride7_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -3259,16 +3259,18 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm6 -; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] +; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -3315,163 +3317,161 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} 
xmm2 = mem[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, 
%ymm4 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 
= ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm3 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm3 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm13[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FP-NEXT: vmovdqa %ymm12, 64(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm14, 96(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm14, 64(%r8) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm13, 32(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-FP-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm15, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FP-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i64_stride7_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -3491,16 +3491,18 @@ define void 
@load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -3547,157 +3549,155 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} 
xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, 
%ymm0, %ymm3 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm3 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm14, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) +; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 64(%r8) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm13, 32(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-FCP-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm15, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FCP-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3709,7 +3709,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 @@ -3718,10 +3718,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm31 ; AVX512-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 ; AVX512-NEXT: movb $24, %r11b ; AVX512-NEXT: kmovw %r11d, %k2 @@ -3738,7 +3738,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5] ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 ; AVX512-NEXT: vpermt2q %zmm30, 
%zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} @@ -3749,7 +3749,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3759,7 +3759,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 @@ -3784,8 +3784,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 ; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm26, %zmm1, %zmm25 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 @@ -3794,9 +3793,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -3806,7 +3805,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 ; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 +; AVX512-NEXT: vpermi2q %zmm1, %zmm26, %zmm25 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -3816,12 +3815,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 @@ -3832,45 +3831,44 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm3 ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 +; AVX512-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 ; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 ; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 +; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm31, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) @@ -3879,12 +3877,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r10) -; AVX512-NEXT: vmovdqa64 %zmm4, (%r10) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r10) +; AVX512-NEXT: vmovdqa64 %zmm3, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3896,7 +3894,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 @@ -3905,10 +3903,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm31 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 ; AVX512-FCP-NEXT: movb $24, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k2 @@ -3925,7 +3923,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} @@ -3936,7 +3934,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = 
[9,0,7,0,9,0,7,0] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3946,7 +3944,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 @@ -3971,8 +3969,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm1, %zmm25 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 @@ -3981,9 +3978,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -3993,7 +3990,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -4003,12 +4000,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 @@ -4019,45 +4016,44 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) @@ -4066,12 +4062,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -4083,7 +4079,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13 @@ -4092,10 +4088,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm31 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 ; AVX512DQ-NEXT: movb $24, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k2 @@ -4112,7 +4108,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 
{%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5] ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} @@ -4123,7 +4119,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm20, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4133,7 +4129,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 @@ -4158,8 +4154,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm1, %zmm25 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 @@ -4168,9 +4163,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -4180,7 +4175,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm26, %zmm25 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -4190,12 +4185,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = 
[4,11] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 @@ -4206,45 +4201,44 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] +; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 ; 
AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rdx) @@ -4253,12 +4247,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%r10) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r10) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4270,7 +4264,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13 @@ -4279,10 +4273,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31 ; AVX512DQ-FCP-NEXT: movb $24, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 @@ -4299,7 +4293,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14 -; 
AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} @@ -4310,7 +4304,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4320,7 +4314,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17 @@ -4345,8 +4339,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm1, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21 @@ -4355,9 +4348,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -4367,7 +4360,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -4377,12 +4370,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13] ; AVX512DQ-FCP-NEXT: # zmm5 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24 @@ -4393,45 +4386,44 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm5[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx) @@ -4440,12 +4432,12 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -4542,9 +4534,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm27 @@ -4564,11 +4556,11 @@ define void 
@load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4580,39 +4572,39 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 @@ -4627,10 +4619,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -4729,9 +4721,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 @@ -4751,11 +4743,11 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4767,39 +4759,39 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, 
%zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 @@ -4814,10 +4806,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -4916,9 +4908,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm27 @@ -4938,11 +4930,11 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -4954,39 +4946,39 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[4,5,6,14,4,5,6,14] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 @@ -5001,10 +4993,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10) 
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -5103,9 +5095,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 @@ -5125,11 +5117,11 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] @@ -5141,39 +5133,39 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: 
vinserti64x4 $0, %ymm7, %zmm10, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,4,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 @@ -5188,10 +5180,10 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -5773,15 +5765,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i64_stride7_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $1736, %rsp # imm = 0x6C8 +; AVX-NEXT: subq $1704, %rsp # imm = 0x6A8 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5800,10 +5792,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps 1120(%rdi), %xmm11 +; AVX-NEXT: vmovaps 1120(%rdi), %xmm7 ; AVX-NEXT: vmovaps 1168(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1664(%rdi), %ymm14 @@ -5816,10 +5808,10 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 96(%rdi), %ymm4 +; AVX-NEXT: vmovapd 96(%rdi), %ymm13 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] @@ -5845,66 +5837,67 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1440(%rdi), %ymm2 +; AVX-NEXT: vmovapd 1440(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX-NEXT: vmovapd 1344(%rdi), %xmm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm0[3] +; AVX-NEXT: vmovapd 1344(%rdi), %xmm2 ; AVX-NEXT: vmovapd 1392(%rdi), %xmm0 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm0[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = 
ymm5[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 736(%rdi), %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 832(%rdi), %ymm7 +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[3],ymm0[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 736(%rdi), %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 832(%rdi), %ymm6 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[3],ymm6[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 1280(%rdi), %ymm6 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1280(%rdi), %ymm7 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[3],ymm7[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 1728(%rdi), %ymm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[3],ymm11[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 160(%rdi), %ymm8 -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2] +; AVX-NEXT: vmovapd 160(%rdi), %ymm4 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm13[0],ymm4[1],ymm13[3],ymm4[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 608(%rdi), %ymm14 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2] -; AVX-NEXT: vmovdqa 512(%rdi), %xmm4 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3] +; 
AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 1056(%rdi), %ymm13 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2] ; AVX-NEXT: vmovdqa 960(%rdi), %xmm15 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 1504(%rdi), %ymm5 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2] ; AVX-NEXT: vmovdqa 1408(%rdi), %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 352(%rdi), %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] @@ -5912,58 +5905,59 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 800(%rdi), %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3] +; AVX-NEXT: vmovdqa 800(%rdi), %xmm12 +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX-NEXT: vmovapd 688(%rdi), %xmm10 -; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1248(%rdi), %xmm7 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm10[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX-NEXT: vmovapd 1136(%rdi), %xmm6 -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 1696(%rdi), %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] ; AVX-NEXT: vmovapd 1584(%rdi), 
%xmm11 -; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] -; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX-NEXT: vmovapd 16(%rdi), %xmm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm8[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 1472(%rdi), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] -; AVX-NEXT: vmovdqa 1360(%rdi), %xmm15 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1,2,3],xmm15[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 1472(%rdi), %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3] +; AVX-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm9[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] ; AVX-NEXT: vmovupd %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 304(%rdi), %xmm9 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3] @@ -5975,34 +5969,32 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovapd 752(%rdi), %xmm5 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3] ; AVX-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1200(%rdi), %xmm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3] +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1200(%rdi), %xmm2 +; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm2[0],ymm6[2],ymm2[3] ; AVX-NEXT: vmovdqa 1312(%rdi), %xmm6 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1648(%rdi), %xmm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3] -; AVX-NEXT: vmovdqa 1760(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1424(%rdi), %xmm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3] -; AVX-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1648(%rdi), %xmm2 +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm11[1],ymm2[0],ymm11[2],ymm2[3] +; AVX-NEXT: vmovdqa 1760(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1424(%rdi), %xmm12 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[2],ymm12[3] +; 
AVX-NEXT: vmovdqa 1536(%rdi), %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6014,33 +6006,33 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 528(%rdi), %xmm0 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[1],ymm0[0],ymm4[2],ymm0[3] ; AVX-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 80(%rdi), %xmm3 +; AVX-NEXT: vmovapd 80(%rdi), %xmm7 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3] -; AVX-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm7[0],ymm2[2],ymm7[3] +; AVX-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX-NEXT: vmovapd 128(%rdi), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3] -; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 -; AVX-NEXT: vmovapd 352(%rdi), %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3] -; AVX-NEXT: vmovapd 256(%rdi), %xmm3 -; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm7 +; AVX-NEXT: vmovapd 352(%rdi), %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3] +; AVX-NEXT: vmovapd 256(%rdi), %xmm14 +; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm14[0],xmm9[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -6051,241 +6043,235 @@ define void 
@load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX-NEXT: vmovapd 800(%rdi), %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] -; AVX-NEXT: vmovapd 704(%rdi), %xmm14 -; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 800(%rdi), %ymm11 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm11[0,1,2],ymm0[3] +; AVX-NEXT: vmovapd 704(%rdi), %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovapd 1024(%rdi), %ymm4 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3] -; AVX-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX-NEXT: vmovapd 928(%rdi), %xmm4 +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],xmm15[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 ; AVX-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] -; AVX-NEXT: vmovapd 1152(%rdi), %xmm15 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm15[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 -; AVX-NEXT: vmovaps 1472(%rdi), %ymm5 -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovaps 1376(%rdi), %xmm8 -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm8[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 1696(%rdi), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] -; AVX-NEXT: vmovapd 1600(%rdi), %xmm13 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm13[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 192(%rdi), %ymm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2] -; AVX-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 416(%rdi), %ymm10 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2] -; AVX-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = 
xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3] +; AVX-NEXT: vmovapd 1152(%rdi), %xmm10 +; AVX-NEXT: vblendpd $2, (%rsp), %xmm10, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm10[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, (%rsp) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vmovapd 1472(%rdi), %ymm15 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3] +; AVX-NEXT: vmovapd 1376(%rdi), %xmm5 +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm5[0],xmm12[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 1696(%rdi), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3] +; AVX-NEXT: vmovapd 1600(%rdi), %xmm8 +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm8[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 192(%rdi), %ymm12 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[3],ymm12[2] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 640(%rdi), %ymm5 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm13[0],ymm6[1],ymm13[3],ymm6[2] +; AVX-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 640(%rdi), %ymm2 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[3],ymm2[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 864(%rdi), %ymm7 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2] -; AVX-NEXT: vmovdqa 768(%rdi), %xmm4 -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm11[0],ymm7[1],ymm11[3],ymm7[2] +; AVX-NEXT: vmovdqa 768(%rdi), %xmm11 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 
1088(%rdi), %ymm3 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1088(%rdi), %ymm13 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[3],ymm13[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1312(%rdi), %ymm9 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2] -; AVX-NEXT: vmovdqa 1216(%rdi), %xmm2 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 1312(%rdi), %ymm4 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[3],ymm4[2] +; AVX-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm10[0,1],ymm0[2,3] ; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 1536(%rdi), %ymm8 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2] -; AVX-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX-NEXT: # ymm11 = mem[0,1,2],ymm11[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2],ymm5[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2],ymm7[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX-NEXT: 
vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = mem[0,1,2],ymm9[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX-NEXT: # ymm2 = mem[0,1,2],ymm8[3] +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1536(%rdi), %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[0],ymm0[1],ymm15[3],ymm0[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm5[0,1],ymm10[2,3] +; AVX-NEXT: vmovapd 1760(%rdi), %ymm5 +; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[0],ymm5[1],ymm9[3],ymm5[2] +; AVX-NEXT: vmovdqa 1664(%rdi), %xmm9 +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm8[0,1],ymm10[2,3] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm8[0],mem[1] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX-NEXT: # ymm12 = mem[0,1,2],ymm12[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm3[0,1],ymm6[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = mem[0,1,2],ymm7[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: # ymm2 = mem[0,1,2],ymm13[3] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = 
ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%r9) -; 
AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%r9) -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovapd %ymm12, 224(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rax) -; AVX-NEXT: vmovapd %ymm15, 160(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rax) -; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rax) +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = mem[0,1,2],ymm5[3] +; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm9[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, (%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 192(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 224(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 160(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 32(%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 224(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 192(%r9) +; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 160(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 32(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovapd %ymm10, 224(%rax) -; AVX-NEXT: vmovapd %ymm4, 192(%rax) -; AVX-NEXT: vmovapd %ymm7, 160(%rax) -; AVX-NEXT: vmovapd %ymm3, 128(%rax) -; AVX-NEXT: vmovapd %ymm5, 96(%rax) -; AVX-NEXT: vmovapd %ymm6, 64(%rax) -; AVX-NEXT: vmovapd %ymm11, 32(%rax) -; AVX-NEXT: vmovapd %ymm14, (%rax) -; AVX-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX-NEXT: vmovapd %ymm15, 192(%rax) +; AVX-NEXT: vmovapd %ymm14, 160(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 96(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 64(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 32(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: 
vmovaps %ymm4, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vmovapd %ymm11, 224(%rax) +; AVX-NEXT: vmovapd %ymm0, 192(%rax) +; AVX-NEXT: vmovapd %ymm1, 160(%rax) +; AVX-NEXT: vmovapd %ymm2, 128(%rax) +; AVX-NEXT: vmovapd %ymm3, 96(%rax) +; AVX-NEXT: vmovapd %ymm12, 64(%rax) +; AVX-NEXT: vmovapd %ymm6, 32(%rax) +; AVX-NEXT: vmovapd %ymm8, (%rax) +; AVX-NEXT: addq $1704, %rsp # imm = 0x6A8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6293,7 +6279,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2: # %bb.0: ; AVX2-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm2 @@ -6301,58 +6287,58 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-NEXT: vmovdqa 224(%rdi), %xmm5 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 672(%rdi), %xmm6 ; AVX2-NEXT: vmovdqa 720(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm6 +; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm7 ; AVX2-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovdqa 1568(%rdi), %xmm8 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vmovdqa 1568(%rdi), %xmm9 ; AVX2-NEXT: vmovdqa 1616(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm10 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6382,97 +6368,97 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 736(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm5 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte 
Folded Reload -; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm15 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm1 +; AVX2-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm7 +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm6 -; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 960(%rdi), %xmm10 -; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm9 -; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm11 -; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm12 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 800(%rdi), %ymm4 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 1248(%rdi), %ymm2 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-NEXT: vmovdqa 1136(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm2 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm2 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm10 +; AVX2-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm11 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-NEXT: 
vmovdqa 240(%rdi), %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 800(%rdi), %ymm5 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] +; AVX2-NEXT: vmovdqa 688(%rdi), %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1136(%rdi), %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-NEXT: vpbroadcastq 1248(%rdi), %ymm11 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] +; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm8[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 1024(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm9[2,3] +; AVX2-NEXT: vpbroadcastq 1472(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0 @@ -6491,22 +6477,22 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm5 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 
= mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm10 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm5 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6519,266 +6505,266 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 256(%rdi), %xmm13 +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 704(%rdi), %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 928(%rdi), %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 704(%rdi), %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm3 -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovdqu 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 -; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 864(%rdi), %ymm11 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 768(%rdi), %xmm10 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 928(%rdi), %xmm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm4 +; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm0 +; 
AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm15[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1312(%rdi), %ymm4 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1216(%rdi), %xmm8 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa 1440(%rdi), 
%xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovdqa 1760(%rdi), %ymm3 -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa 1312(%rdi), %ymm11 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1216(%rdi), %xmm2 +; AVX2-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovdqa 1440(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqa 1760(%rdi), %ymm6 +; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm0[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload +; AVX2-NEXT: # xmm10 = mem[0,1],xmm15[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-NEXT: # xmm5 = mem[0,1],xmm8[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%r9) +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpbroadcastq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 
# 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm7, 224(%rax) -; AVX2-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rax) +; AVX2-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm13, 160(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-NEXT: vmovdqa %ymm4, 160(%rax) -; AVX2-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm11, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-NEXT: vmovdqa %ymm11, 224(%rax) +; AVX2-NEXT: vmovdqa %ymm7, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm10, 128(%rax) +; AVX2-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-NEXT: 
vzeroupper ; AVX2-NEXT: retq @@ -6787,7 +6773,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2 @@ -6795,58 +6781,58 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %xmm8 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %xmm9 ; AVX2-FP-NEXT: vmovdqa 1616(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = 
xmm8[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm10 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6876,97 +6862,97 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm5 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm1 +; AVX2-FP-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm7 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm9 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm11 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm12 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 800(%rdi), %ymm4 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1248(%rdi), %ymm2 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-FP-NEXT: vmovdqa 1136(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm2 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm10 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 
= ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm11 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 800(%rdi), %ymm5 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] +; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1136(%rdi), %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-FP-NEXT: vpbroadcastq 1248(%rdi), %ymm11 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm1 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm1 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 1024(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vpbroadcastq 1472(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0 @@ -6985,22 +6971,22 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm10 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm5 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7013,266 +6999,266 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vperm2i128 
{{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 -; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm11 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm4 +; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm15[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # 
ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %ymm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %xmm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %ymm3 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %ymm11 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %xmm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %ymm6 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = 
ymm0[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm15[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm5 = mem[0,1],xmm8[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 
-; AVX2-FP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx) +; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm7, 224(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm13, 160(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, 160(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm11, 224(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, 192(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm10, 128(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-FP-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -7281,7 +7267,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 @@ -7289,58 +7275,58 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %xmm8 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %xmm9 ; AVX2-FCP-NEXT: vmovdqa 1616(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7370,97 +7356,97 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 800(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1248(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-FCP-NEXT: vmovdqa 1136(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm10 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq 800(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] +; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1136(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 1248(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; 
AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 1024(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 1472(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0 @@ -7479,22 +7465,22 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm10 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7507,305 +7493,303 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm15[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX2-FCP-NEXT: 
vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm0[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload +; 
AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm5 = mem[0,1],xmm8[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi) 
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm7, 224(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm13, 160(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) +; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 160(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 224(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 192(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 128(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-FCP-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i64_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512-NEXT: subq $2824, %rsp # imm = 0xB08 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] ; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [10,3,10,3,10,3,10,3] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; 
AVX512-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -7813,110 +7797,112 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] +; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512-NEXT: vpermt2q %zmm17, %zmm9, %zmm21 +; AVX512-NEXT: vpermt2q %zmm26, %zmm7, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm25, %zmm12, %zmm13 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm12 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm10, %zmm9, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm18 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm27 # 64-byte 
Folded Reload ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm28 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-NEXT: vpermt2q %zmm25, %zmm23, %zmm31 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25 ; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-NEXT: vpermi2q %zmm24, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -7928,70 +7914,72 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[0,5,6,10,0,5,6,10] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm31 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 +; 
AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm24 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $24, %al @@ -8004,194 +7992,196 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm21 = [u,u,4,11] ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 +; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm16 ; AVX512-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm19 +; AVX512-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm14[4,5,4,5] +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 912(%rdi), %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm23 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 -; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm21 +; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; 
AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq 
{{.*#+}} xmm0 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 960(%rdi), %ymm15 ; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm12 -; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-32, %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm23 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm19, %zmm19 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512-NEXT: vinserti32x4 $0, %xmm14, %zmm22, %zmm14 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-NEXT: vinserti32x4 $0, %xmm15, 
%zmm20, %zmm9 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 +; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 @@ -8208,31 +8198,31 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX512-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm24, %zmm6 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; 
AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm23, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%r8) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm4, (%r9) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm4, 64(%r9) @@ -8248,45 +8238,43 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512-NEXT: vmovaps %zmm8, (%rax) ; AVX512-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512-NEXT: addq $2824, %rsp # imm = 0xB08 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512-FCP-NEXT: subq $2824, %rsp # imm = 0xB08 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = 
[0,7,14,0,0,7,14,0] ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [10,3,10,3,10,3,10,3] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8294,110 +8282,112 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] +; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm27 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm28 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm31 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm1, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -8409,70 +8399,72 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm31 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm5, 
%zmm2, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $24, %al @@ -8485,194 +8477,196 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [u,u,4,11] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte 
Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm14[4,5,4,5] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm23 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm15 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-32, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 
%zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm22, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm20, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 
640(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} ; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 @@ -8689,31 +8683,31 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm24, %zmm6 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm4, (%r9) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%r9) @@ -8729,45 +8723,43 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512-FCP-NEXT: vmovaps %zmm8, (%rax) ; AVX512-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512-FCP-NEXT: addq $2824, %rsp # imm = 0xB08 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-NEXT: subq $2824, %rsp # imm = 0xB08 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] ; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [10,3,10,3,10,3,10,3] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8775,110 +8767,112 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] +; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, 
%zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), 
%zmm14 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm9, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm27 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm28 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm23, %zmm31 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: 
vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermi2q %zmm24, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -8890,70 +8884,72 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm31 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 
%zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $24, %al @@ -8966,194 +8962,196 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm1, 
%zmm4, %zmm20 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm21 = [u,u,4,11] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm14[4,5,4,5] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm23 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 -; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm21 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm11 -; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm15 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm12 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = 
mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-32, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm23 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm19, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm14, %zmm22, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm20, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 @@ -9170,31 +9168,31 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm24, %zmm6 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%r8) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%r8) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm4, (%r9) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm4, 64(%r9) @@ -9210,45 +9208,43 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-NEXT: vmovaps %zmm8, (%rax) ; AVX512DQ-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-NEXT: addq $2824, %rsp # imm = 0xB08 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-FCP-NEXT: subq $2824, %rsp # imm = 0xB08 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [10,3,10,3,10,3,10,3] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm17, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -9256,110 +9252,112 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm27 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm28 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm31 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm9 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -9371,70 +9369,72 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $24, %al @@ -9447,194 +9447,196 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [u,u,4,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm14[4,5,4,5] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: 
vinserti32x4 $0, %xmm11, %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-32, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm22, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm20, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 
%zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 @@ -9651,31 +9653,31 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm24, %zmm6 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: 
vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%r9) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%r9) @@ -9691,147 +9693,152 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rax) ; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-FCP-NEXT: addq $2824, %rsp # imm = 0xB08 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512BW-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, 
%zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm26 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, 
%zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm30, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -9846,20 +9853,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 @@ -9869,8 +9876,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 @@ -9880,81 +9887,82 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm30 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,4,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm18[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm11[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] @@ -9962,24 +9970,24 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm16, %zmm18 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -9989,71 +9997,71 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 
1472(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm12, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm23 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm23 ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10062,87 +10070,86 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm24, %zmm6 ; AVX512BW-NEXT: vmovdqa64 
64(%rdi), %ymm17 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm21, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm8, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm17, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm28[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload 
-; AVX512BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm5 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512BW-NEXT: 
vmovdqa64 %zmm1, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -10157,154 +10164,158 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512BW-NEXT: vmovaps %zmm9, (%rax) -; AVX512BW-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512BW-NEXT: vmovaps %zmm5, 64(%rax) +; AVX512BW-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] +; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,0,7,0,9,0,7,0] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,4,11,4,11,4,11,4] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm26 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm29 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm31 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm30, %zmm28 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 
+; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -10319,20 +10330,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 @@ -10342,8 +10353,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 @@ -10353,81 +10364,82 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q 
%zmm5, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, 
%zmm10, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,0,9,0,7,0,9,0] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,4,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm18[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm11[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] @@ -10435,24 +10447,24 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm16, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -10462,71 +10474,71 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, 
%ymm1, %zmm25, %zmm23 ; AVX512BW-FCP-NEXT: movb $-32, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10535,87 +10547,86 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm24, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm21, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm8, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm17, %ymm5 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm28[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa 
1536(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm5 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm9 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm8 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 
32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -10630,154 +10641,158 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512BW-FCP-NEXT: vmovaps %zmm9, (%rax) -; AVX512BW-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512BW-FCP-NEXT: vmovaps %zmm5, 64(%rax) +; AVX512BW-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride7_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] +; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,0,7,0,9,0,7,0] +; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,4,11,4,11,4,11,4] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, 
%zmm24, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; 
AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm26 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm29 +; AVX512DQ-BW-NEXT: 
vpermi2q %zmm1, %zmm13, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm30, %zmm28 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm13, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -10792,20 +10807,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 @@ -10815,8 +10830,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 @@ -10826,81 +10841,82 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,0,9,0,7,0,9,0] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,4,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm18[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm11[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q 
%zmm4, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] @@ -10908,24 +10924,24 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm16, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -10935,71 +10951,71 @@ define void @load_i64_stride7_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm23, %zmm12, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 
%zmm4 +; AVX512DQ-BW-NEXT: vpermi2q %zmm23, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm23 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm23 ; AVX512DQ-BW-NEXT: movb $-32, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11008,87 +11024,86 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 
{%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm24, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm17 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm21, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm8, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vpalignr $8, (%rsp), %ymm17, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = 
ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm28[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm5 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 
32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm8 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 64(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r8) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11103,154 +11118,158 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-BW-NEXT: vmovaps %zmm9, (%rax) -; AVX512DQ-BW-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-BW-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512DQ-BW-NEXT: vmovaps %zmm5, 64(%rax) +; AVX512DQ-BW-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0] +; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,0,7,0,9,0,7,0] +; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,4,11,4,11,4,11,4] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm26 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm30, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm13, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm13, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -11265,20 +11284,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm5, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 @@ -11288,8 +11307,8 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31 @@ -11299,81 +11318,82 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm16[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [7,0,9,0,7,0,9,0] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,4,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm18[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm11[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] @@ -11381,24 +11401,24 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, 
%xmm3, %zmm18, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm16, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 @@ -11408,71 +11428,71 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) 
# 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm23, %zmm12, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm23, %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11481,87 +11501,86 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm24, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm21, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm8, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm17, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm28[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm5, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, 
%zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11576,14 +11595,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm9, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm5, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 @@ -12740,41 +12758,41 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i64_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $4232, %rsp # imm = 0x1088 +; AVX-NEXT: subq $4280, %rsp # imm = 0x10B8 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm3 ; AVX-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 224(%rdi), %xmm7 ; AVX-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps 672(%rdi), %xmm11 +; AVX-NEXT: vmovaps 672(%rdi), %xmm8 ; AVX-NEXT: vmovaps 720(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps 1120(%rdi), %xmm12 +; AVX-NEXT: vmovaps 1120(%rdi), %xmm11 ; AVX-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1664(%rdi), %ymm6 +; AVX-NEXT: vmovapd 1664(%rdi), %ymm5 ; AVX-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd 1568(%rdi), %xmm2 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovapd 1616(%rdi), %xmm1 @@ -12782,10 +12800,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2112(%rdi), %ymm7 +; AVX-NEXT: vmovapd 2112(%rdi), %ymm9 ; AVX-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd 2016(%rdi), %xmm2 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovapd 2064(%rdi), %xmm1 @@ -12793,10 +12811,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2560(%rdi), %ymm8 +; AVX-NEXT: vmovapd 2560(%rdi), %ymm12 ; AVX-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd 2464(%rdi), %xmm2 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovapd 2512(%rdi), %xmm1 @@ -12804,11 +12822,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 3008(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 3008(%rdi), %ymm15 ; AVX-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vmovaps 2912(%rdi), %xmm0 ; AVX-NEXT: vmovaps 2960(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12820,10 +12837,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 3360(%rdi), %xmm15 +; AVX-NEXT: vmovaps 3360(%rdi), %xmm10 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 3408(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rdi), %ymm2 @@ -12841,11 +12859,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 496(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 992(%rdi), %ymm2 @@ -12853,11 +12871,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 896(%rdi), %xmm10 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 944(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1440(%rdi), %ymm2 @@ -12865,24 +12883,23 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 1344(%rdi), 
%xmm9 +; AVX-NEXT: vmovaps 1344(%rdi), %xmm10 ; AVX-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1888(%rdi), %ymm14 ; AVX-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 1792(%rdi), %xmm13 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 1840(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3] +; AVX-NEXT: vmovapd 1792(%rdi), %xmm13 +; AVX-NEXT: vmovapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd 1840(%rdi), %xmm2 +; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm13[0],xmm2[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 2336(%rdi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1 @@ -12913,75 +12930,74 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovaps 3136(%rdi), %xmm13 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 3184(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 384(%rdi), %ymm2 -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 384(%rdi), %ymm7 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[3],ymm7[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 832(%rdi), %ymm14 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 832(%rdi), %ymm6 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[3],ymm6[2] ; AVX-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 1280(%rdi), %ymm13 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1280(%rdi), %ymm2 +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[3],ymm2[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 1728(%rdi), %ymm12 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2] +; AVX-NEXT: vmovapd 1728(%rdi), %ymm8 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm8[1],ymm5[3],ymm8[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 2080(%rdi), %xmm1 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 2176(%rdi), %ymm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[3],ymm11[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 2528(%rdi), %xmm1 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 2624(%rdi), %ymm10 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2] +; AVX-NEXT: vmovapd 2624(%rdi), %ymm9 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm12[0],ymm9[1],ymm12[3],ymm9[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 2976(%rdi), %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 3072(%rdi), %ymm2 -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2] +; AVX-NEXT: vmovapd 3072(%rdi), %ymm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[3],ymm1[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 3424(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 3520(%rdi), %ymm15 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 3520(%rdi), %ymm12 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = 
ymm1[0],ymm12[1],ymm1[3],ymm12[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] -; AVX-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 608(%rdi), %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13009,471 +13025,467 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2] ; AVX-NEXT: vmovdqa 1408(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1952(%rdi), %ymm9 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2] -; AVX-NEXT: vmovdqa 1856(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2400(%rdi), %ymm6 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2] -; AVX-NEXT: vmovdqa 2304(%rdi), %xmm8 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1952(%rdi), %ymm10 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm10[1],ymm14[3],ymm10[2] +; AVX-NEXT: vmovdqa 1856(%rdi), %xmm14 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2848(%rdi), %ymm4 +; AVX-NEXT: vmovapd 2400(%rdi), %ymm4 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2] -; AVX-NEXT: vmovdqa 2752(%rdi), %xmm5 
+; AVX-NEXT: vmovdqa 2304(%rdi), %xmm5 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 3296(%rdi), %ymm2 +; AVX-NEXT: vmovapd 2848(%rdi), %ymm2 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2] -; AVX-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vmovdqa 2752(%rdi), %xmm3 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 3296(%rdi), %ymm1 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; AVX-NEXT: vmovdqa 3200(%rdi), %xmm15 +; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] +; AVX-NEXT: vmovapd 240(%rdi), %xmm13 +; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm13[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 800(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] -; AVX-NEXT: vmovapd 688(%rdi), %xmm7 -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX-NEXT: vmovapd 688(%rdi), %xmm6 +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX-NEXT: vmovapd 1136(%rdi), %xmm7 -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 1136(%rdi), %xmm6 +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX-NEXT: vmovapd 1584(%rdi), %xmm7 -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX-NEXT: vmovapd 1584(%rdi), %xmm6 +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 2144(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX-NEXT: vmovapd 2032(%rdi), %xmm7 -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vmovapd 2032(%rdi), %xmm6 +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 2592(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX-NEXT: vmovapd 2480(%rdi), %xmm7 -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX-NEXT: vmovapd 2480(%rdi), %xmm6 +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 3040(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 2928(%rdi), %xmm6 +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 2928(%rdi), %xmm7 -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} 
ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 3488(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX-NEXT: vmovapd 3376(%rdi), %xmm1 -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX-NEXT: vmovapd 3376(%rdi), %xmm6 +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 3264(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vmovdqa 3152(%rdi), %xmm1 +; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 2816(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX-NEXT: vmovdqa 2704(%rdi), %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 2368(%rdi), %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX-NEXT: vmovdqa 2256(%rdi), %xmm10 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX-NEXT: vmovdqa 2368(%rdi), %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX-NEXT: vmovdqa 2256(%rdi), %xmm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm5[4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX-NEXT: vmovapd 1808(%rdi), %xmm5 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm5[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 1472(%rdi), %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 -; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 
32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[0,1,2],mem[3] -; AVX-NEXT: vmovapd 1360(%rdi), %xmm3 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm3[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 1024(%rdi), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX-NEXT: # ymm2 = ymm1[0,1,2],mem[3] -; AVX-NEXT: vmovapd 912(%rdi), %xmm1 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = xmm1[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 1920(%rdi), %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX-NEXT: vmovdqa 1808(%rdi), %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm14[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 1472(%rdi), %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = ymm1[0,1,2],mem[3] +; AVX-NEXT: vmovapd 1360(%rdi), %xmm1 +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm1[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 1024(%rdi), %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 +; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = ymm3[0,1,2],mem[3] +; AVX-NEXT: vmovapd 912(%rdi), %xmm3 +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm3[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 576(%rdi), %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 -; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX-NEXT: # ymm2 = ymm2[0,1,2],mem[3] -; AVX-NEXT: vmovapd 464(%rdi), %xmm6 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX-NEXT: # xmm11 = xmm6[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 -; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX-NEXT: # ymm11 = ymm11[0,1,2],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm5 +; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = ymm5[0,1,2],mem[3] +; AVX-NEXT: vmovapd 464(%rdi), %xmm8 +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload +; AVX-NEXT: # xmm10 = xmm8[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, 
%ymm10 +; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX-NEXT: # ymm10 = ymm10[0,1,2],mem[3] ; AVX-NEXT: vmovapd 16(%rdi), %xmm12 -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = xmm12[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] -; AVX-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 80(%rdi), %xmm13 -; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3] +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = xmm12[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3] +; AVX-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 80(%rdi), %xmm14 +; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm12[1],ymm14[0],ymm12[2],ymm14[3] ; AVX-NEXT: vmovdqa 192(%rdi), %xmm12 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 304(%rdi), %xmm2 -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3] -; AVX-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 528(%rdi), %xmm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3] -; AVX-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 304(%rdi), %xmm5 +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm13[1],ymm5[0],ymm13[2],ymm5[3] +; AVX-NEXT: vmovdqa 416(%rdi), %xmm10 +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, (%rsp), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 528(%rdi), %xmm10 +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[1],ymm10[0],ymm8[2],ymm10[3] +; AVX-NEXT: vmovdqa 640(%rdi), %xmm8 +; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = 
ymm5[0,1],ymm9[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 752(%rdi), %xmm9 -; AVX-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3] -; AVX-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm9[0],ymm5[2],ymm9[3] +; AVX-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 976(%rdi), %xmm9 -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3] -; AVX-NEXT: vmovdqa 1088(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3] -; AVX-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm3[1],ymm9[0],ymm3[2],ymm9[3] +; AVX-NEXT: vmovdqa 1088(%rdi), %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1200(%rdi), %xmm7 +; AVX-NEXT: vmovupd %ymm7, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm7[0],ymm5[2],ymm7[3] +; AVX-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX-NEXT: vmovapd 1424(%rdi), %xmm7 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3] +; AVX-NEXT: vmovdqa 1536(%rdi), %xmm5 +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 1648(%rdi), %xmm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovdqa 1760(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3] -; AVX-NEXT: vmovdqa 1536(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1648(%rdi), %xmm7 -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3] -; AVX-NEXT: vmovdqa 1760(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 1872(%rdi), %xmm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3] -; AVX-NEXT: vmovdqa 1984(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm0[0],ymm4[2],ymm0[3] +; AVX-NEXT: vmovdqa 1984(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2096(%rdi), %xmm3 -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 2096(%rdi), %xmm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3] -; AVX-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovdqa 2208(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 2320(%rdi), %xmm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3] -; AVX-NEXT: vmovdqa 2432(%rdi), %xmm3 -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3] +; AVX-NEXT: vmovdqa 2432(%rdi), %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 2544(%rdi), %xmm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovdqa 2656(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2544(%rdi), %xmm14 +; AVX-NEXT: vmovapd 2768(%rdi), %xmm1 +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3] -; AVX-NEXT: vmovdqa 2656(%rdi), %xmm4 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovdqa 2880(%rdi), %xmm6 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 2992(%rdi), %xmm4 +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[3] +; AVX-NEXT: vmovdqa 3104(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = 
mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2768(%rdi), %xmm4 -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3] -; AVX-NEXT: vmovdqa 2880(%rdi), %xmm8 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 3216(%rdi), %xmm4 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2992(%rdi), %xmm5 -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3] -; AVX-NEXT: vmovdqa 3104(%rdi), %xmm5 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[3] +; AVX-NEXT: vmovdqa 3328(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 3440(%rdi), %xmm4 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 3216(%rdi), %xmm5 -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3] -; AVX-NEXT: vmovdqa 3328(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3] +; AVX-NEXT: vmovdqa 3552(%rdi), %xmm1 +; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 3440(%rdi), %xmm5 -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufpd 
{{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3] -; AVX-NEXT: vmovdqa 3552(%rdi), %xmm4 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] -; AVX-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 352(%rdi), %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] -; AVX-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = xmm6[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 128(%rdi), %ymm5 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 576(%rdi), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] -; AVX-NEXT: vmovapd 480(%rdi), %xmm0 -; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 800(%rdi), %ymm11 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm14[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 352(%rdi), %ymm14 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3] +; AVX-NEXT: vmovapd 256(%rdi), %xmm5 +; AVX-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm5[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 +; AVX-NEXT: vmovapd 576(%rdi), %ymm8 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3] +; AVX-NEXT: vmovapd 480(%rdi), %xmm5 +; AVX-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],xmm10[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 800(%rdi), %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 
= ymm10[0,1,2],ymm4[3] ; AVX-NEXT: vmovapd 704(%rdi), %xmm5 ; AVX-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = xmm5[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX-NEXT: vmovapd 1024(%rdi), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX-NEXT: vmovapd 928(%rdi), %xmm0 -; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3] +; AVX-NEXT: vmovapd 928(%rdi), %xmm15 +; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm15[0],xmm9[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX-NEXT: vmovaps 1248(%rdi), %ymm4 +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX-NEXT: vmovaps 1472(%rdi), %ymm9 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 1376(%rdi), %xmm15 -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm15[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 1696(%rdi), %ymm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vblendps $12, (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 1472(%rdi), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3] +; AVX-NEXT: vmovapd 1376(%rdi), %xmm12 +; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm12[0],xmm7[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX-NEXT: vmovapd 1696(%rdi), %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3] ; AVX-NEXT: vmovapd 1600(%rdi), %xmm0 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm0[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload ; AVX-NEXT: vmovaps 1920(%rdi), %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 1824(%rdi), %xmm12 -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm12[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 2144(%rdi), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX-NEXT: vmovapd 2048(%rdi), %xmm0 -; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm0[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vmovaps 1824(%rdi), %xmm11 +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm11[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX-NEXT: vmovaps 2144(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vmovaps 2048(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vmovapd 2368(%rdi), %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] ; AVX-NEXT: vmovapd 2272(%rdi), %xmm0 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm0[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 2592(%rdi), %ymm0 -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; AVX-NEXT: vmovapd 2496(%rdi), %xmm0 -; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX-NEXT: vmovapd 2816(%rdi), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX-NEXT: vmovapd 2720(%rdi), %xmm1 -; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm0[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX-NEXT: vmovaps 2592(%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovaps 2496(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 2816(%rdi), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] +; AVX-NEXT: vmovapd 2720(%rdi), %xmm2 +; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: vmovaps 3040(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps 2944(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps 3040(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: vmovapd 3264(%rdi), %ymm14 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX-NEXT: vmovapd 3168(%rdi), %xmm1 -; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 3264(%rdi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 3168(%rdi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 3488(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -13493,7 +13505,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[3],ymm0[2] ; AVX-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -13505,12 +13517,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 640(%rdi), %ymm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[3],ymm1[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[3],ymm1[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 864(%rdi), %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2] ; AVX-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -13518,8 +13530,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 1088(%rdi), 
%ymm1 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2] @@ -13536,28 +13547,28 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 1536(%rdi), %ymm15 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; AVX-NEXT: vmovapd 1760(%rdi), %ymm0 +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2] ; AVX-NEXT: vmovdqa 1664(%rdi), %xmm13 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX-NEXT: vmovapd 1984(%rdi), %ymm11 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovapd 1984(%rdi), %ymm12 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm12[1],ymm7[3],ymm12[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 2208(%rdi), %ymm12 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2] +; AVX-NEXT: vmovapd 2208(%rdi), %ymm11 +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[3],ymm11[2] ; AVX-NEXT: vmovdqa 2112(%rdi), %xmm10 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] @@ -13567,7 +13578,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 2432(%rdi), %ymm9 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm9[1],ymm4[3],ymm9[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 2656(%rdi), %ymm8 @@ -13582,7 +13593,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 2880(%rdi), %ymm5 -; AVX-NEXT: vshufpd 
{{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm5[1],ymm3[3],ymm5[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 3104(%rdi), %ymm6 @@ -13597,7 +13608,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vmovapd 3328(%rdi), %ymm3 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[3],ymm3[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 3552(%rdi), %ymm2 @@ -13608,12 +13620,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -13657,28 +13669,29 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm15[3] ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX-NEXT: # ymm12 = mem[0,1,2],ymm12[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = 
ymm0[0,1],ymm12[2,3] ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm11[3] -; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] -; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = mem[0,1,2],ymm12[3] ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm10[0,1],ymm0[2,3] ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm9[3] ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1],ymm0[2,3] ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm8[3] ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload @@ -13852,9 +13865,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm1, 224(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 192(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 160(%r9) ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 160(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 128(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 96(%r9) @@ -13903,11 +13916,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovapd %ymm4, 416(%rax) ; AVX-NEXT: vmovapd %ymm5, 384(%rax) ; AVX-NEXT: vmovapd %ymm8, 352(%rax) -; AVX-NEXT: vmovapd %ymm9, 320(%rax) -; AVX-NEXT: vmovapd %ymm10, 288(%rax) -; AVX-NEXT: vmovapd %ymm11, 256(%rax) -; AVX-NEXT: vmovaps %ymm14, 224(%rax) -; AVX-NEXT: vmovapd %ymm15, 192(%rax) +; AVX-NEXT: vmovapd %ymm10, 320(%rax) +; AVX-NEXT: vmovapd %ymm13, 288(%rax) +; AVX-NEXT: vmovapd %ymm14, 256(%rax) +; AVX-NEXT: vmovaps %ymm15, 224(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 192(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 160(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13920,7 +13934,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm0, 32(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, (%rax) -; AVX-NEXT: addq $4232, %rsp # imm = 0x1088 +; AVX-NEXT: addq $4280, %rsp # imm = 0x10B8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -13983,11 +13997,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 
2560(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 2624(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 2464(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 2512(%rdi), %xmm1 @@ -13995,18 +14009,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 3008(%rdi), %ymm14 -; AVX2-NEXT: vmovdqa 3072(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 2912(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 2960(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 3520(%rdi), %xmm0 @@ -14031,41 +14046,41 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX2-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vmovdqa 608(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 1056(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 496(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 1056(%rdi), %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 896(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 944(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 896(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 944(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm13 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 1392(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1888(%rdi), %ymm3 @@ -14074,11 +14089,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 1792(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 1792(%rdi), %xmm13 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 1840(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-NEXT: vmovdqa 2336(%rdi), %ymm3 @@ -14087,11 +14102,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 2240(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 2240(%rdi), %xmm13 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 2288(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2784(%rdi), %ymm3 @@ -14100,11 +14115,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 2688(%rdi), %xmm13 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 2736(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 3232(%rdi), %ymm3 @@ -14113,25 +14128,23 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovdqa 3136(%rdi), %xmm15 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 3136(%rdi), %xmm13 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 3184(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 736(%rdi), %xmm2 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm2 @@ -14143,75 +14156,78 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1632(%rdi), %xmm2 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm15 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2080(%rdi), %xmm2 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 2176(%rdi), %ymm12 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2528(%rdi), %xmm2 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 2624(%rdi), %ymm11 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 2624(%rdi), %ymm9 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2976(%rdi), %xmm2 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 3072(%rdi), %ymm10 -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = 
mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 3424(%rdi), %xmm2 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 3520(%rdi), %ymm9 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 3520(%rdi), %ymm10 +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 512(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 960(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm13 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1952(%rdi), %ymm8 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] @@ -14244,53 +14260,52 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps 240(%rdi), %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 800(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1248(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm14 +; AVX2-NEXT: vmovdqa 240(%rdi), %xmm14 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-NEXT: vmovdqa 688(%rdi), %xmm13 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 1136(%rdi), %xmm0 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 1248(%rdi), %ymm13 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-NEXT: vmovdqa 2032(%rdi), %xmm12 +; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm12 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 2144(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-NEXT: vmovdqa 2480(%rdi), %xmm11 +; AVX2-NEXT: vmovdqa 2032(%rdi), %xmm11 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 3040(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-NEXT: vmovdqa 2928(%rdi), %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-NEXT: vmovdqa 2480(%rdi), %xmm9 +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 2928(%rdi), %xmm0 +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 3040(%rdi), %ymm9 +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 3488(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-NEXT: vmovdqa 3376(%rdi), %xmm9 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] @@ -14298,34 +14313,35 @@ 
define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastq 2816(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 2816(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 2368(%rdi), %ymm0 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-NEXT: vpbroadcastq 1920(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 1024(%rdi), %ymm0 +; AVX2-NEXT: vbroadcastsd 1472(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastsd 1024(%rdi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 576(%rdi), %ymm0 @@ -14333,11 +14349,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastsd 128(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 128(%rdi), %ymm0 -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -14365,186 +14381,186 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1088(%rdi), %xmm8 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm8 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm7 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm5 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 1984(%rdi), %xmm5 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 1984(%rdi), %xmm4 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2080(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 2208(%rdi), %xmm2 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2304(%rdi), %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 2432(%rdi), %xmm2 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 2528(%rdi), %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 2656(%rdi), %xmm14 -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 2880(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 2432(%rdi), %xmm1 +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 2656(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 2976(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vmovdqa 3104(%rdi), %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 2880(%rdi), %xmm3 +; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 2976(%rdi), %ymm10 +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 3104(%rdi), %xmm10 +; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 3200(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 3200(%rdi), %ymm14 +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 3328(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 3424(%rdi), %ymm1 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 3424(%rdi), %ymm14 +; AVX2-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 3552(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = 
ymm14[2,3],ymm15[2,3] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 
= ymm11[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa 1920(%rdi), %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqa 1824(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa 2144(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqa 2048(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -14553,7 +14569,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 2816(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 2720(%rdi), %xmm1 @@ -14561,22 +14577,22 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa 2944(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 3168(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload @@ -14597,7 +14613,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -14609,12 +14625,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -14626,7 +14642,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1312(%rdi), %ymm0 @@ -14641,8 +14657,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 1440(%rdi), %xmm0 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 1760(%rdi), %ymm0 @@ -14671,7 +14687,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 2432(%rdi), %ymm8 -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 2656(%rdi), %ymm9 @@ -14693,75 +14709,75 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: vpalignr $8, (%rsp), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 3328(%rdi), %ymm3 -; AVX2-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 3552(%rdi), %ymm2 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = 
mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: 
vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload @@ -14769,47 +14785,47 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload ; AVX2-NEXT: # xmm10 = mem[0,1],xmm11[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX2-NEXT: # xmm4 = mem[0,1],xmm5[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -15004,15 +15020,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm14, 480(%rax) -; AVX2-NEXT: vmovdqa %ymm0, 448(%rax) -; AVX2-NEXT: vmovdqa %ymm4, 416(%rax) -; AVX2-NEXT: vmovdqa %ymm8, 384(%rax) -; AVX2-NEXT: vmovdqa %ymm7, 352(%rax) -; AVX2-NEXT: vmovdqa %ymm10, 320(%rax) -; AVX2-NEXT: vmovdqa %ymm11, 288(%rax) -; AVX2-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-NEXT: vmovdqa %ymm5, 448(%rax) +; AVX2-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-NEXT: vmovdqa %ymm4, 384(%rax) +; AVX2-NEXT: vmovdqa %ymm8, 352(%rax) +; AVX2-NEXT: vmovdqa %ymm11, 320(%rax) +; AVX2-NEXT: vmovdqa %ymm10, 288(%rax) +; 
AVX2-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-NEXT: vmovdqa %ymm14, 224(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15090,11 +15106,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa 2560(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 2624(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 2464(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 2512(%rdi), %xmm1 @@ -15102,18 +15118,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 3008(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqa 3072(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 2912(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 2960(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 3520(%rdi), %xmm0 @@ -15138,41 +15155,41 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 896(%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa 944(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 896(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 944(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 1392(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1888(%rdi), %ymm3 @@ -15181,11 +15198,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 1792(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 1792(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 1840(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2336(%rdi), %ymm3 @@ -15194,11 +15211,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 2240(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 2240(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 2288(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2784(%rdi), %ymm3 @@ -15207,11 +15224,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 2688(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 2736(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 3232(%rdi), %ymm3 @@ -15220,25 +15237,23 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqa 3136(%rdi), %xmm15 -; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 3136(%rdi), %xmm13 +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 3184(%rdi), %xmm3 ; AVX2-FP-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm2 @@ -15250,75 +15265,78 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm15 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2080(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 2176(%rdi), %ymm12 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2528(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte 
Folded Reload ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 2624(%rdi), %ymm11 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 2624(%rdi), %ymm9 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2976(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 3072(%rdi), %ymm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 3424(%rdi), %xmm2 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 3520(%rdi), %ymm9 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 3520(%rdi), %ymm10 +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa 
512(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm13 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1952(%rdi), %ymm8 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] @@ -15351,53 +15369,52 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 800(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 1248(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm14 +; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm14 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-FP-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 1136(%rdi), %xmm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 1248(%rdi), %ymm13 +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-FP-NEXT: vmovdqa 2032(%rdi), %xmm12 +; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-FP-NEXT: vpbroadcastq 2144(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-FP-NEXT: vmovdqa 2480(%rdi), %xmm11 +; AVX2-FP-NEXT: 
vmovdqa 2032(%rdi), %xmm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 3040(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FP-NEXT: vmovdqa 2928(%rdi), %xmm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FP-NEXT: vmovdqa 2480(%rdi), %xmm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 2928(%rdi), %xmm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 3040(%rdi), %ymm9 +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 3488(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-FP-NEXT: vmovdqa 3376(%rdi), %xmm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] @@ -15405,34 +15422,35 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpbroadcastq 2816(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 2816(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 2368(%rdi), %ymm0 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vpbroadcastq 1920(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-FP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 1024(%rdi), %ymm0 +; AVX2-FP-NEXT: vbroadcastsd 1472(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastsd 1024(%rdi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 576(%rdi), %ymm0 @@ -15440,11 +15458,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastsd 128(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 128(%rdi), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -15472,186 +15490,186 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %xmm8 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm7 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm5 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %xmm5 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %xmm4 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2080(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 2208(%rdi), %xmm2 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2304(%rdi), %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = 
mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 2432(%rdi), %xmm2 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovdqa 2432(%rdi), %xmm1 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2528(%rdi), %ymm0 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 2656(%rdi), %xmm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 2880(%rdi), %xmm0 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 2656(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 2976(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vmovdqa 3104(%rdi), %xmm4 -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 2880(%rdi), %xmm3 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 2976(%rdi), %ymm10 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 3104(%rdi), %xmm10 +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm1[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 3200(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 3200(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 3328(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 3424(%rdi), %ymm1 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 3424(%rdi), %ymm14 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 3552(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] -; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-FP-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm8 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vmovdqa 1920(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqa 1824(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovdqa 2144(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa 2048(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm0[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -15660,7 +15678,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 2816(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 2720(%rdi), %xmm1 @@ -15668,22 +15686,22 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa 2944(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 3168(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload @@ -15704,7 +15722,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -15716,12 +15734,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -15733,7 +15751,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %ymm0 @@ -15748,8 +15766,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %xmm0 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %ymm0 @@ -15778,7 +15796,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded 
Reload ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 2432(%rdi), %ymm8 -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 2656(%rdi), %ymm9 @@ -15800,75 +15818,75 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpalignr $8, (%rsp), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 3328(%rdi), %ymm3 -; AVX2-FP-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 3552(%rdi), %ymm2 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FP-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload @@ -15876,47 +15894,47 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm11[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -16111,15 +16129,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm14, 480(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm0, 448(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, 416(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm8, 384(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm7, 352(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm10, 320(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, 288(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm5, 448(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm4, 384(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm8, 352(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm11, 320(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm14, 224(%rax) ; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16197,11 +16215,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa 2560(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 2624(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 2464(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2512(%rdi), %xmm1 @@ -16209,18 +16227,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 3008(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqa 3072(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 2912(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 2960(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 3520(%rdi), %xmm0 @@ -16245,41 +16264,41 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 608(%rdi), 
%xmm0 +; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 496(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa 944(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 944(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1392(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1888(%rdi), %ymm3 @@ -16288,11 +16307,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 1792(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1792(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1840(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2336(%rdi), %ymm3 @@ -16301,11 +16320,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 2240(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2240(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2288(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2784(%rdi), %ymm3 @@ -16314,11 +16333,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2688(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2736(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 3232(%rdi), %ymm3 @@ -16327,25 +16346,23 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovdqa 3136(%rdi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 3136(%rdi), %xmm13 +; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX2-FCP-NEXT: vmovdqa 3184(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm2 @@ -16357,75 +16374,78 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2080(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 2176(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovdqa 2528(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 2624(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 2624(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2976(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 3072(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 3424(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 3520(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 3520(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1952(%rdi), %ymm8 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; 
AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] @@ -16458,53 +16478,52 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 800(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 1248(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm14 +; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 1136(%rdi), %xmm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 1248(%rdi), %ymm13 +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-FCP-NEXT: vmovdqa 2032(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq 2144(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-FCP-NEXT: vmovdqa 2480(%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovdqa 2032(%rdi), %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 3040(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-FCP-NEXT: vmovdqa 2928(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FCP-NEXT: vmovdqa 2480(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 2928(%rdi), %xmm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 3040(%rdi), %ymm9 +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 3488(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-FCP-NEXT: vmovdqa 3376(%rdi), %xmm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] @@ -16512,34 +16531,35 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 2816(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 2816(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 2368(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpbroadcastq 1920(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = 
mem[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 1024(%rdi), %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd 1472(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 1024(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 576(%rdi), %ymm0 @@ -16547,11 +16567,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 128(%rdi), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -16579,186 +16599,186 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX2-FCP-NEXT: 
vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2080(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; 
AVX2-FCP-NEXT: vmovdqa 2208(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2304(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 2432(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovdqa 2432(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2528(%rdi), %ymm0 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 2656(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 2880(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 2656(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 2976(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vmovdqa 3104(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 2880(%rdi), %xmm3 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 2976(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 3104(%rdi), %xmm10 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 3200(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 3200(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 3328(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 3424(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 3424(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 3552(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vmovdqa 
1248(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqa 1920(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqa 1824(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqa 2144(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa 2048(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -16767,7 +16787,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 2816(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 2720(%rdi), %xmm1 @@ -16775,22 +16795,22 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 2944(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 3168(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload @@ -16811,7 +16831,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -16823,12 +16843,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -16840,7 +16860,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %ymm0 @@ -16855,8 +16875,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 
= ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %ymm0 @@ -16885,7 +16905,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 2432(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 2656(%rdi), %ymm9 @@ -16907,75 +16927,75 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 3328(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 3552(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FCP-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload @@ -16983,47 +17003,47 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] 
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -17218,15 +17238,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm14, 480(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 448(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 416(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 384(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 352(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm10, 320(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 288(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 448(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 384(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 352(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 320(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 224(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -17247,46 +17267,44 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: subq $7432, %rsp # imm = 0x1D08 +; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -17294,290 +17312,290 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 -; 
AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512-NEXT: vpermi2q %zmm17, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; 
AVX512-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q %zmm28, %zmm10, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm3 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm24, %zmm11, %zmm6 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 +; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm9 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm9 +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512-NEXT: 
vmovdqa64 2304(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm26 +; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 +; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm12 +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm10, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 3264(%rdi), 
%ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm15 +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm12 +; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm15 +; AVX512-NEXT: vpermi2q %zmm27, %zmm15, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm13, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm21 +; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 
# 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512-NEXT: 
vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 @@ -17587,567 +17605,558 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm27, %zmm15, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 ; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm7, %zmm13, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [10,3,10,3,10,3,10,3] +; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q 
%zmm17, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,4,11,4,11,4,11,4] +; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm29, %zmm23, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm10 +; 
AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 ; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm8, %zmm31, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512-NEXT: vpermt2q %zmm14, %zmm23, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm4 +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-NEXT: vpermi2q %zmm13, %zmm7, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm24 +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm1 +; AVX512-NEXT: vpermi2q %zmm8, %zmm7, %zmm16 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] +; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm28 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm29 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm5, %zmm19 +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm27 +; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm23 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm31 +; AVX512-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, 
%zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512-NEXT: vbroadcasti64x4 
{{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,4,11] +; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm0[4,5,4,5],zmm30[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; 
AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte 
Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: movb $-32, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm30 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, 
%zmm28 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm4 +; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vinserti32x4 $0, %xmm3, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm17 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa 960(%rdi), %ymm9 +; AVX512-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm9 +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm5, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa 2304(%rdi), %ymm10 ; AVX512-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vblendps $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa 1856(%rdi), %ymm12 ; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm1, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm1, %zmm11 ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm31, 448(%rsi) +; AVX512-NEXT: vinserti32x4 $0, %xmm13, %zmm14, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm30, 448(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm13, 384(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -18158,130 +18167,126 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovaps %zmm13, 192(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm13, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm13, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-NEXT: 
vmovdqa64 %zmm25, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 320(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm29, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 448(%r8) ; AVX512-NEXT: vmovdqa64 %zmm12, 256(%r8) ; AVX512-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, (%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, 384(%r8) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 448(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 256(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 320(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%r9) +; AVX512-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 384(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, (%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 384(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512-NEXT: vmovaps %zmm5, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-NEXT: addq $7432, %rsp # imm = 0x1D08 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: subq $7432, %rsp # imm = 0x1D08 +; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte 
Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -18289,290 +18294,290 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm17, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, 
%zmm8 -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512-FCP-NEXT: 
vmovdqa64 3136(%rdi), %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm15 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm12 +; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm15, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] 
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 @@ -18582,567 +18587,558 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm15, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [10,3,10,3,10,3,10,3] +; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; 
AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,4,11,4,11,4,11,4] +; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512-FCP-NEXT: vpermt2q 
%zmm8, %zmm21, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm7, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] +; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm23 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, 
%zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, 
%zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q 
%zmm9, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] +; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,4,11] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm26, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm0[4,5,4,5],zmm30[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 
# 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-32, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm4 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 
%zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm5, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi) +; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 448(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm13, 384(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -19153,130 +19149,126 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm13, 192(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; 
AVX512-FCP-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm13, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm13, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 256(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 320(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 320(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, (%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 384(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 384(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovaps %zmm1, 384(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512-FCP-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512-FCP-NEXT: vmovaps %zmm5, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-FCP-NEXT: addq $7432, %rsp # imm = 0x1D08 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: subq $7432, %rsp # imm = 0x1D08 +; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -19284,290 +19276,290 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm17, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, 
%zmm9, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm10, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm24 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm11, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm10, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; 
AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512DQ-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512DQ-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm12 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512DQ-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm10, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm15 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm12 +; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm27 +; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm15, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x4 $0, 
%ymm10, %zmm16, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 @@ -19577,567 +19569,558 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm15, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm13, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [10,3,10,3,10,3,10,3] +; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: 
vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,4,11,4,11,4,11,4] +; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm21, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm23, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, 
%zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm31, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm23, 
%zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm31, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm23, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm16, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-NEXT: vpermi2q %zmm13, %zmm7, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm31, %zmm24 +; 
AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm29 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, 
%zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm5, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm23 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 
64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: 
vpermt2q %zmm17, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] +; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,4,11] +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm0[4,5,4,5],zmm30[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; 
AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 
# 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: movb $-32, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 
%zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; 
AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm4 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm3, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 
960(%rdi), %ymm8 -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm9 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm9 +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm5, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa 2304(%rdi), %ymm10 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 1856(%rdi), %ymm12 ; AVX512DQ-NEXT: vpalignr 
{{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm1, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 448(%rsi) +; AVX512DQ-NEXT: vinserti32x4 $0, %xmm13, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 448(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm13, 384(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -20148,130 +20131,126 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm13, 192(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm13, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm13, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 320(%rdx) +; AVX512DQ-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 256(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 320(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, (%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 384(%r8) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 448(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 256(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 320(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 384(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512DQ-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, (%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 384(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512DQ-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-NEXT: vmovaps %zmm5, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-NEXT: addq $7432, %rsp # imm = 0x1D08 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: subq $7432, %rsp # imm = 0x1D08 +; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), 
%zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm1 @@ -20279,290 +20258,290 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm17, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; 
AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: 
vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm10, %zmm15, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm10, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte 
Folded Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 @@ -20572,567 +20551,558 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm15, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: 
vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [10,3,10,3,10,3,10,3] +; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,4,11,4,11,4,11,4] +; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm31, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm7, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] +; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm15, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm20, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,4,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k2} = zmm0[4,5,4,5],zmm30[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k2} = zmm0[4,5,4,5],zmm5[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-32, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm23, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} 
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 
# 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm3 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: 
vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm5, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vblendps $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi) +; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 448(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 384(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload @@ -21143,122 +21113,120 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 192(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm13, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 448(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 256(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 320(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 384(%r8) +; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 384(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovaps %zmm5, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax) -; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $7432, %rsp # imm = 0x1D08 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-NEXT: subq $7688, %rsp # imm = 0x1E08 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 @@ -21273,9 +21241,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -21285,30 +21253,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -21317,17 +21285,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] @@ -21337,12 +21306,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 @@ -21359,14 +21327,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21377,15 +21346,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), 
%zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -21396,15 +21365,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -21414,16 +21382,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 @@ -21433,19 +21401,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 
%zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 @@ -21453,111 +21420,112 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm14, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = ymm26[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm26[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm9 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 ; AVX512BW-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21566,10 +21534,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 @@ -21577,25 +21545,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm2 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm14, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21607,325 +21575,323 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6] +; 
AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm20 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, 
%zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte 
Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm30, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm26 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm14 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm28 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,11] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm5[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -21944,29 +21910,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21975,32 +21943,31 @@ define void @load_i64_stride7_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -22013,16 +21980,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} @@ -22036,49 +22003,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512BW-NEXT: 
vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm25, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm4 @@ -22086,10 +22052,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm5 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 @@ -22100,10 +22066,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa 1856(%rdi), %ymm7 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 @@ -22112,38 +22078,39 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm8 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = 
mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, 128(%rsi) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm29, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm12, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm11, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%r8) @@ -22189,55 +22156,56 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 
256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: addq $7688, %rsp # imm = 0x1E08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: subq $7688, %rsp # imm = 0x1E08 +; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17 @@ -22252,9 +22220,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -22264,30 +22232,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -22296,17 
+22264,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] @@ -22316,12 +22285,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 @@ -22338,14 +22306,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22356,15 +22325,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -22375,15 +22344,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -22393,16 +22361,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 
%zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 @@ -22412,19 +22380,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 @@ -22432,111 +22399,112 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm14, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm26[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm26[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22545,10 +22513,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 @@ -22556,25 +22524,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22586,325 +22554,323 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [10,3,10,3,10,3,10,3] +; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,4,11,4,11,4,11,4] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [12,5,12,5,12,5,12,5] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload 
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm30, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, 
%zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm5[4,5,4,5] ; AVX512BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -22923,29 +22889,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22954,32 +22922,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -22992,16 +22959,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} @@ -23015,49 +22982,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm25, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 @@ -23065,10 +23031,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpalignr 
{{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 @@ -23079,10 +23045,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 @@ -23091,38 +23057,39 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi) +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm29, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, 384(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, 128(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm12, (%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm11, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8) @@ -23168,55 +23135,56 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-FCP-NEXT: addq $7688, %rsp # imm = 0x1E08 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride7_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: subq $7688, %rsp # imm = 0x1E08 +; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm21 +; 
AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm17 @@ -23231,9 +23199,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -23243,30 +23211,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -23275,17 +23243,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] @@ -23295,12 +23264,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 @@ -23317,14 +23285,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23335,15 +23304,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -23354,15 +23323,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -23372,16 +23340,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 2368(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 @@ -23391,19 +23359,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm10, 
%zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 @@ -23411,111 +23378,112 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm14, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm9 = ymm26[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm26[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, 
%zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm9 = 
mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23524,10 +23492,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 @@ -23535,25 +23503,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 3328(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23565,325 +23533,323 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q 
%zmm21, %zmm27, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm30, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte 
Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[4,5,8,15,4,5,8,15] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm5[4,5,4,5] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -23902,29 +23868,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 
# 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23933,32 +23901,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -23971,16 +23938,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} @@ -23994,49 +23961,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 2752(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm25, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 1408(%rdi), %ymm4 @@ -24044,10 +24010,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 @@ -24058,10 +24024,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa 1856(%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 @@ -24070,38 +24036,39 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, 128(%rsi) +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm29, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 448(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, 384(%rsi) +; 
AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, 128(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm12, (%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm11, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm2, 384(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 448(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 256(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 256(%r8) @@ -24147,55 +24114,56 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-NEXT: addq $7688, %rsp # imm = 0x1E08 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: subq $7688, %rsp # imm = 0x1E08 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), 
%zmm17 @@ -24210,9 +24178,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -24222,30 +24190,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -24254,17 +24222,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, 
%zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] @@ -24274,12 +24243,11 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 @@ -24296,14 +24264,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24314,15 +24283,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -24333,15 +24302,14 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 @@ -24351,16 +24319,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 @@ -24370,19 +24338,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0 @@ -24390,111 +24357,112 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm14, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm26[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm26[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24503,10 +24471,10 @@ define 
void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 @@ -24514,25 +24482,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte 
Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24544,325 +24512,323 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [10,3,10,3,10,3,10,3] +; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [12,5,12,5,12,5,12,5] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, 
%zmm14, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm31, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, 
%zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm30, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,12,0,5,6,12] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0] -; AVX512DQ-BW-FCP-NEXT: # zmm30 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte 
Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm5[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -24881,29 +24847,31 @@ define void 
@load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24912,32 +24880,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -24950,16 +24917,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} @@ -24973,49 +24940,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm25, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4 @@ -25023,10 +24989,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 @@ -25037,10 +25003,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7 @@ -25049,38 +25015,39 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm29, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 448(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 384(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 128(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8) @@ -25126,18 +25093,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8 +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: addq $7688, %rsp # imm = 0x1E08 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 51b6222077f82..bb555a0c681c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -583,32 +583,32 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; 
AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] ; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5 -; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6 +; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm10, %ymm6 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-NEXT: vmovaps %ymm11, (%rsi) ; AVX2-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-NEXT: vmovaps %ymm4, (%r8) ; AVX2-NEXT: vmovaps %ymm7, (%r9) ; AVX2-NEXT: vmovaps %ymm5, (%r11) @@ -631,32 +631,32 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5 -; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6 +; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm10, %ymm6 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: 
vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovaps %ymm11, (%rsi) ; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-FP-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8) ; AVX2-FP-NEXT: vmovaps %ymm7, (%r9) ; AVX2-FP-NEXT: vmovaps %ymm5, (%r11) @@ -679,32 +679,32 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm6 +; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm10, %ymm6 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovaps %ymm11, (%rsi) ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-FCP-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8) ; AVX2-FCP-NEXT: vmovaps %ymm7, (%r9) ; AVX2-FCP-NEXT: vmovaps %ymm5, (%r11) @@ -738,24 +738,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = 
[0,0,4,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovaps %ymm7, (%rsi) @@ -794,24 +794,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -850,24 +850,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-NEXT: 
vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovaps %ymm7, (%rsi) @@ -906,24 +906,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -962,24 +962,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovaps %ymm7, (%rsi) @@ -1018,24 +1018,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; 
AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -1074,24 +1074,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-BW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovaps %ymm7, (%rsi) @@ -1130,24 +1130,24 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512DQ-BW-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} ymm10 = [0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm7, (%rsi) @@ -1478,22 +1478,22 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm12 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm13 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-NEXT: vmovaps 384(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm3, %ymm5 ; AVX2-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -1501,39 +1501,39 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-NEXT: vmovaps 
480(%rdi), %ymm12 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-NEXT: vmovaps 480(%rdi), %ymm13 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 -; AVX2-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 +; AVX2-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm9 +; AVX2-NEXT: vmovaps 288(%rdi), %xmm11 +; AVX2-NEXT: vinsertf128 $1, 416(%rdi), %ymm11, %ymm11 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm12, %ymm12 ; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-NEXT: vmovaps 416(%rdi), %ymm15 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 @@ -1542,9 +1542,9 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1563,11 +1563,11 @@ 
define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps %ymm8, (%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-NEXT: vmovaps %ymm13, (%r9) -; AVX2-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-NEXT: vmovaps %ymm11, (%r9) +; AVX2-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm14, (%rax) -; AVX2-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-NEXT: vmovaps %ymm12, (%rax) +; AVX2-NEXT: vmovaps %ymm9, 32(%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm5, (%rax) ; AVX2-NEXT: vmovaps %ymm7, 32(%rax) @@ -1584,22 +1584,22 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm3, %ymm5 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -1607,39 +1607,39 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps 
288(%rdi), %ymm10 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm13 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 +; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm11 +; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdi), %ymm11, %ymm11 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm12, %ymm12 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm15 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 @@ -1648,9 +1648,9 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = 
ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1669,11 +1669,11 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps %ymm8, (%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FP-NEXT: vmovaps %ymm13, (%r9) -; AVX2-FP-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-FP-NEXT: vmovaps %ymm11, (%r9) +; AVX2-FP-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm14, (%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rax) @@ -1690,22 +1690,22 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm3, %ymm5 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -1713,39 +1713,39 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm9 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm13 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm9 +; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm11 +; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdi), %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm12, %ymm12 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm15 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 @@ -1754,9 +1754,9 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; 
AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1775,11 +1775,11 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm13, (%r9) -; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm11, (%r9) +; AVX2-FCP-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm14, (%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rax) @@ -1860,18 +1860,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -1894,8 +1894,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: popq %rbx @@ -1972,18 +1972,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, 
%ymm12, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2006,8 +2006,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: popq %rbx @@ -2084,18 +2084,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2118,8 +2118,8 @@ define 
void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: popq %rbx @@ -2196,18 +2196,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2230,8 +2230,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FCP-NEXT: popq %rbx @@ -2308,18 +2308,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, 
%zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2342,8 +2342,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: popq %rbx @@ -2420,18 +2420,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2454,8 +2454,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 
(%r11) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: popq %rbx @@ -2532,18 +2532,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2566,8 +2566,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: popq %rbx @@ -2644,18 +2644,18 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm0[1],zmm3[1],zmm0[3],zmm3[3],zmm0[5],zmm3[5],zmm0[7],zmm3[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 @@ -2678,8 +2678,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r11) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: popq %rbx @@ -3339,28 +3339,28 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 256(%rdi), %ymm13 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm6 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm5, %ymm7 +; AVX2-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 832(%rdi), %xmm8 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 768(%rdi), %xmm9 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm11, %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 576(%rdi), %xmm12 ; 
AVX2-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 @@ -3372,12 +3372,13 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX2-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3574,28 +3575,28 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 -; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm6 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm5, %ymm7 +; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm8 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm9 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm11, %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm12 ; AVX2-FP-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 @@ -3607,12 +3608,13 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX2-FP-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3809,28 +3811,28 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm5, %ymm7 +; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm8 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm9 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11 +; 
AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm11, %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm12 ; AVX2-FCP-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 @@ -3842,12 +3844,13 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX2-FCP-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7180,29 +7183,29 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX2-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = 
ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7214,16 +7217,16 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1088(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -7700,29 +7703,29 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-FP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-FP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-FP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovaps 
(%rdi), %xmm9 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7734,16 +7737,16 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -8220,29 +8223,29 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-FCP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-FCP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-FCP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FCP-NEXT: vmovups 
%ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8254,16 +8257,16 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -8735,475 +8738,481 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride8_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm13 
{%k1} ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: 
vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 576(%rdi), 
%zmm26 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, 
%zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = 
zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 +; AVX512-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = 
zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vpunpckhqdq 
{{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; 
AVX512-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9229,7 +9238,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, (%rax) @@ -9239,489 +9248,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride8_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512-FCP-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: 
vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = 
ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq 
{{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 
%zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; 
AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 
{%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; 
AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 
{%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; 
AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9747,7 +9762,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) @@ -9757,489 +9772,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride8_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512DQ-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqa64 384(%rdi), %zmm26 ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512DQ-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = 
ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512DQ-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-NEXT: 
vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q 
%zmm18, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] 
+; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-NEXT: 
vmovdqa64 %zmm22, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 
%zmm10 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512DQ-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512DQ-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm7, 
%zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512DQ-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10265,7 +10286,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) @@ -10275,489 +10296,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-FCP-NEXT: 
vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512DQ-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = 
ymm15[2,3],ymm11[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, 
%zmm26, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10783,7 +10810,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) @@ -10793,489 +10820,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, 
%zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), 
%zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; 
AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, 
%zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512BW-NEXT: 
vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; 
AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, 
%zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 
{%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11301,7 +11334,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) @@ -11311,489 +11344,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512BW-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: 
vmovdqa64 1472(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; 
AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; 
AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; 
AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, 
%zmm27, %zmm18 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 
%zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm16 
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm7 = 
ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11819,7 +11858,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) @@ -11829,489 +11868,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride8_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512DQ-BW-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 
1216(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512DQ-BW-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512DQ-BW-NEXT: 
vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] 
-; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, 
%zmm23, %zmm27 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = 
ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12337,7 +12382,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps 
%zmm0, 128(%r9) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) @@ -12347,489 +12392,495 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-BW-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-BW-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3208, %rsp # imm = 0xC88 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; 
AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm22[0],ymm10[2],ymm22[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm29[0],ymm12[2],ymm29[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm9[2,3],ymm4[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm7[0],ymm17[0],ymm7[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm22[0],ymm4[0],ymm22[2],ymm4[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm25[0],ymm28[2],ymm25[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm30[0],ymm1[2],ymm30[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm16[0],ymm13[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; 
AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm21[1],ymm17[1],ymm21[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm29[1],ymm12[3],ymm29[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm17[1],ymm7[3],ymm17[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
1088(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm22[1],ymm4[1],ymm22[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm16[1],ymm13[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm17[0],zmm19[0],zmm17[2],zmm19[2],zmm17[4],zmm19[4],zmm17[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm24[0],zmm16[2],zmm24[2],zmm16[4],zmm24[4],zmm16[6],zmm24[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm30[0],zmm21[2],zmm30[2],zmm21[4],zmm30[4],zmm21[6],zmm30[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = 
[5,13,5,13,5,13,5,13] +; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [1,9,1,9,1,9,1,9] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm27[0],zmm24[0],zmm27[2],zmm24[2],zmm27[4],zmm24[4],zmm27[6],zmm24[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm25[0],zmm29[0],zmm25[2],zmm29[2],zmm25[4],zmm29[4],zmm25[6],zmm29[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = 
zmm27[1],zmm24[1],zmm27[3],zmm24[3],zmm27[5],zmm24[5],zmm27[7],zmm24[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm25[1],zmm29[1],zmm25[3],zmm29[3],zmm25[5],zmm29[5],zmm25[7],zmm29[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 
%zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm10[1],zmm12[1],zmm10[3],zmm12[3],zmm10[5],zmm12[5],zmm10[7],zmm12[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm24[1],zmm31[1],zmm24[3],zmm31[3],zmm24[5],zmm31[5],zmm24[7],zmm31[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm21, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm2, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm20[1],zmm22[3],zmm20[3],zmm22[5],zmm20[5],zmm22[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm4, %zmm20, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm27, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm4[1],zmm0[3],zmm4[3],zmm0[5],zmm4[5],zmm0[7],zmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm22, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 
%zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm12[1],zmm15[1],zmm12[3],zmm15[3],zmm12[5],zmm15[5],zmm12[7],zmm15[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm27, %zmm15 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm22, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm21[1],zmm20[1],zmm21[3],zmm20[3],zmm21[5],zmm20[5],zmm21[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm20, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm17, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm17, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm18, %zmm7, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm7, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm30, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm16[0],ymm15[0],ymm16[2],ymm15[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm21, %zmm6, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %xmm25 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm25, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm21[0],ymm25[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm25[0],ymm19[0],ymm25[2],ymm19[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm26, %zmm6, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm15[1],ymm16[3],ymm15[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm21[1],ymm25[3],ymm21[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm19[1],ymm25[3],ymm19[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm23, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm10 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%rsi) +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; 
AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12855,7 +12906,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) @@ -12865,15 +12916,15 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $3208, %rsp # imm = 0xC88 +; AVX512DQ-BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -12899,7 +12950,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i64_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $3720, %rsp # imm = 0xE88 +; SSE-NEXT: subq $3736, %rsp # imm = 0xE98 ; SSE-NEXT: movaps 960(%rdi), %xmm0 ; SSE-NEXT: movaps 832(%rdi), %xmm1 ; SSE-NEXT: movaps 768(%rdi), %xmm8 @@ -13119,8 +13170,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 4032(%rdi), %xmm0 ; SSE-NEXT: movaps 3968(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm0 @@ -13732,11 +13784,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 3056(%rdi), %xmm0 -; SSE-NEXT: movaps 2992(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 2992(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 3184(%rdi), %xmm0 ; SSE-NEXT: movaps 3120(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 @@ -13744,496 +13796,497 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 3312(%rdi), %xmm0 -; SSE-NEXT: movaps 3248(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 3248(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 3440(%rdi), %xmm0 -; SSE-NEXT: movaps 3376(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 3376(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 3568(%rdi), %xmm0 ; SSE-NEXT: movaps 3504(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 4080(%rdi), %xmm0 ; SSE-NEXT: movaps 4016(%rdi), %xmm4 ; SSE-NEXT: movaps 3952(%rdi), %xmm3 -; SSE-NEXT: movaps 3696(%rdi), %xmm0 +; SSE-NEXT: movaps 3888(%rdi), %xmm1 +; SSE-NEXT: movaps 3696(%rdi), %xmm5 ; SSE-NEXT: movaps 3632(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps 4080(%rdi), %xmm1 -; SSE-NEXT: movaps 3888(%rdi), %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps 3824(%rdi), %xmm6 -; SSE-NEXT: movaps 3760(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm7, 496(%rsi) -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 480(%rsi) -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 3824(%rdi), %xmm5 +; SSE-NEXT: movaps 3760(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 496(%rsi) +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 480(%rsi) ; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%rsi) -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm1, 432(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rcx) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm1, 384(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, 352(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm0, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rdx) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: 
movaps %xmm0, 256(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 496(%rax) -; SSE-NEXT: movaps %xmm6, 480(%rax) +; SSE-NEXT: movaps %xmm5, 480(%rax) ; SSE-NEXT: movaps %xmm7, 464(%rax) -; SSE-NEXT: movaps %xmm10, 448(%rax) +; SSE-NEXT: movaps %xmm11, 448(%rax) ; SSE-NEXT: movaps %xmm13, 432(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rax) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm4, 496(%rax) -; SSE-NEXT: movaps %xmm5, 
480(%rax) -; SSE-NEXT: movaps %xmm0, 464(%rax) +; SSE-NEXT: movaps %xmm1, 480(%rax) +; SSE-NEXT: movaps %xmm6, 464(%rax) ; SSE-NEXT: movaps %xmm2, 448(%rax) ; SSE-NEXT: movaps %xmm8, 432(%rax) -; SSE-NEXT: movaps %xmm12, 416(%rax) -; SSE-NEXT: movaps %xmm9, 400(%rax) +; SSE-NEXT: movaps %xmm9, 416(%rax) +; SSE-NEXT: movaps %xmm10, 400(%rax) ; SSE-NEXT: movaps %xmm15, 384(%rax) -; SSE-NEXT: movaps %xmm11, 368(%rax) +; SSE-NEXT: movaps %xmm12, 368(%rax) ; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) @@ -14279,7 +14332,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $3720, %rsp # imm = 0xE88 +; SSE-NEXT: addq $3736, %rsp # imm = 0xE98 ; SSE-NEXT: retq ; ; AVX-LABEL: load_i64_stride8_vf64: @@ -15641,22 +15694,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] @@ -15675,16 +15728,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 3392(%rdi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 
3328(%rdi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -16737,22 +16790,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-FP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-FP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-FP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] @@ -16771,16 +16824,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 3392(%rdi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 3328(%rdi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -17833,22 +17886,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm4 ; AVX2-FCP-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm6 ; AVX2-FCP-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm7 ; AVX2-FCP-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] @@ -17867,16 +17920,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 3392(%rdi), %xmm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 3328(%rdi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-FCP-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -18924,41 +18977,41 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -18972,12 +19025,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -18989,10 +19040,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -19006,10 +19057,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -19032,7 +19083,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -19050,44 +19101,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 
2944(%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19113,28 +19163,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -19175,7 +19223,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -19187,82 +19235,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 
64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -19270,10 +19318,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19291,47 +19340,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, 
%zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19346,11 +19399,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19365,56 +19418,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -19424,65 +19474,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, 
%zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -19492,135 +19543,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -19628,46 +19680,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm0 +; 
AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -19676,79 +19727,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512-NEXT: vpermt2q %zmm8, %zmm5, 
%zmm29 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19756,118 +19805,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 
= ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; 
AVX512-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm8, 
%zmm31 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -19875,21 +19925,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512-NEXT: vmovaps %zmm1, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19975,60 +20026,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax) 
+; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, (%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i64_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512-FCP-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -20042,12 +20093,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -20059,10 +20108,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -20076,10 +20125,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q 
%zmm11, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -20102,7 +20151,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -20120,44 +20169,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20183,28 +20231,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512-FCP-NEXT: 
vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -20245,7 +20291,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -20257,82 +20303,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -20340,10 +20386,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20361,47 +20408,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = 
zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20416,11 +20467,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20435,56 +20486,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, 
%zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; 
AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -20494,65 +20542,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpunpckhqdq 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; 
AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -20562,135 +20611,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = 
zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512-FCP-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -20698,46 +20748,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm31 +; 
AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -20746,79 +20795,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20826,118 +20873,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 
$0, %ymm0, %zmm26, %zmm26 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, 
%ymm29, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512-FCP-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 
64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} 
ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -20945,21 +20993,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21045,60 +21094,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512-FCP-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i64_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512DQ-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -21112,12 +21161,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -21129,10 +21176,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -21146,10 +21193,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -21172,7 +21219,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -21190,44 +21237,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 
2880(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512DQ-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21253,28 +21299,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: 
vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -21315,7 +21359,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -21327,82 +21371,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 
128(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -21410,10 +21454,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21431,47 +21476,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21486,11 +21535,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 = 
zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21505,56 +21554,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, 
%zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -21564,65 +21610,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -21632,135 +21679,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 
%zmm19, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; 
AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = 
zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -21768,46 +21816,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; 
AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -21816,79 +21863,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} 
zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm26, 
%zmm5, %zmm15 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 
# 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21896,118 +21941,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512DQ-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512DQ-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512DQ-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-NEXT: 
vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512DQ-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512DQ-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512DQ-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512DQ-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512DQ-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512DQ-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512DQ-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -22015,21 +22061,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512DQ-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -22115,60 +22162,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i64_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 3328(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -22182,12 +22229,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -22199,10 +22244,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -22216,10 +22261,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -22242,7 +22287,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -22260,44 +22305,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22323,28 +22367,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -22385,7 +22427,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte 
Reload @@ -22397,82 +22439,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -22480,10 +22522,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22501,47 +22544,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22556,11 +22603,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22575,56 +22622,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -22634,65 +22678,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, 
%ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -22702,135 +22747,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -22838,46 +22884,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -22886,79 +22931,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22966,118 +23009,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: 
vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: 
vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = 
ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -23085,21 +23129,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23185,60 +23230,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-FCP-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512BW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-NEXT: 
vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -23252,12 +23297,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -23269,10 +23312,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -23286,10 +23329,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -23312,7 +23355,7 @@ define void 
@load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -23330,44 +23373,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; 
AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23393,28 +23435,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -23455,7 +23495,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -23467,82 +23507,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte 
Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -23550,10 +23590,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23571,47 +23612,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512BW-NEXT: vmovdqa64 
%zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23626,11 +23671,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23645,56 +23690,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512BW-NEXT: 
vmovdqa64 %zmm25, %zmm22 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -23704,65 +23746,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload 
-; AVX512BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -23772,135 +23815,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, 
%zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -23908,46 +23952,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -23956,79 +23999,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24036,118 +24077,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 
512(%rdi), %xmm28 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -24155,21 +24197,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -24255,60 +24298,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i64_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -24322,12 +24365,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -24339,10 +24380,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -24356,10 +24397,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: 
vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -24382,7 +24423,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -24400,44 +24441,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 
2368(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24463,28 +24503,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm9 # 
64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -24525,7 +24563,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -24537,82 +24575,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; 
AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -24620,10 +24658,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24641,47 +24680,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24696,11 +24739,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24715,56 +24758,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = 
zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] 
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -24774,65 +24814,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, 
%zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -24842,135 +24883,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm2 {%k1} = 
zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -24978,46 +25020,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -25026,79 +25067,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25106,118 +25145,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 
%zmm21 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm23, 
%zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512BW-FCP-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -25225,21 +25265,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -25325,60 +25366,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512BW-FCP-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i64_stride8_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: 
vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -25392,12 +25433,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -25409,10 +25448,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -25426,10 +25465,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -25452,7 +25491,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -25470,44 +25509,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; 
AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-BW-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512DQ-BW-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25533,28 +25571,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -25595,7 +25631,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -25607,82 +25643,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = 
ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -25690,10 +25726,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25711,47 +25748,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25766,11 +25807,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25785,56 +25826,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, 
%zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -25844,65 +25882,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, 
%zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm2 = 
zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -25912,135 +25951,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} 
zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -26048,46 +26088,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -26096,79 +26135,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -26176,118 +26213,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: 
vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512DQ-BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512DQ-BW-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512DQ-BW-NEXT: vinserti128 $1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -26295,21 +26333,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%rsi) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -26395,60 +26434,60 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-BW-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -26462,12 +26501,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -26479,10 +26516,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 @@ -26496,10 +26533,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -26522,7 +26559,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1216(%rdi), %ymm3 @@ -26540,44 +26577,43 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm27[0],ymm26[2],ymm27[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %ymm29 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm30[0],ymm18[2],ymm30[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2240(%rdi), %ymm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %ymm31 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm26[0],ymm31[2],ymm26[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %ymm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -26603,28 +26639,26 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm20[1],mem[1],ymm20[3],mem[3] @@ -26665,7 +26699,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -26677,82 +26711,82 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm29 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm27[1],ymm26[3],ymm27[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm18[1],ymm30[1],ymm18[3],ymm30[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm31[1],ymm26[1],ymm31[3],ymm26[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm27, %zmm30, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm10[0],zmm3[2],zmm10[2],zmm3[4],zmm10[4],zmm3[6],zmm10[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm4[0],zmm9[2],zmm4[2],zmm9[4],zmm4[4],zmm9[6],zmm4[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm31[0],zmm9[2],zmm31[2],zmm9[4],zmm31[4],zmm9[6],zmm31[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -26760,10 +26794,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm3[0],zmm22[0],zmm3[2],zmm22[2],zmm3[4],zmm22[4],zmm3[6],zmm22[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm22[0],zmm25[2],zmm22[2],zmm25[4],zmm22[4],zmm25[6],zmm22[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -26781,47 +26816,51 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm13[0],zmm21[2],zmm13[2],zmm21[4],zmm13[4],zmm21[6],zmm13[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), 
%zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: 
vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[0],mem[0],zmm7[2],mem[2],zmm7[4],mem[4],zmm7[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm4[0],zmm18[0],zmm4[2],zmm18[2],zmm4[4],zmm18[4],zmm4[6],zmm18[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -26836,11 +26875,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm26[0],zmm4[0],zmm26[2],zmm4[2],zmm26[4],zmm4[4],zmm26[6],zmm4[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -26855,56 +26894,53 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm29, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm12[0],zmm11[0],zmm12[2],zmm11[2],zmm12[4],zmm11[4],zmm12[6],zmm11[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = 
zmm29[1],zmm31[1],zmm29[3],zmm31[3],zmm29[5],zmm31[5],zmm29[7],zmm31[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm12[1],zmm18[1],zmm12[3],zmm18[3],zmm12[5],zmm18[5],zmm12[7],zmm18[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -26914,65 +26950,66 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm13[1],zmm3[3],zmm13[3],zmm3[5],zmm13[5],zmm3[7],zmm13[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm14[1],zmm13[1],zmm14[3],zmm13[3],zmm14[5],zmm13[5],zmm14[7],zmm13[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm7[1],mem[1],zmm7[3],mem[3],zmm7[5],mem[5],zmm7[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm4[1],zmm18[1],zmm4[3],zmm18[3],zmm4[5],zmm18[5],zmm4[7],zmm18[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm19[1],zmm24[1],zmm19[3],zmm24[3],zmm19[5],zmm24[5],zmm19[7],zmm24[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm2 = zmm2[1],mem[1],zmm2[3],mem[3],zmm2[5],mem[5],zmm2[7],mem[7] @@ -26982,135 +27019,136 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm25[0],mem[0],zmm25[2],mem[2],zmm25[4],mem[4],zmm25[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm30[0],mem[0],zmm30[2],mem[2],zmm30[4],mem[4],zmm30[6],mem[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm29[0],zmm30[0],zmm29[2],zmm30[2],zmm29[4],zmm30[4],zmm29[6],zmm30[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm29 {%k1} = zmm23[0],mem[0],zmm23[2],mem[2],zmm23[4],mem[4],zmm23[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm30[0],zmm26[0],zmm30[2],zmm26[2],zmm30[4],zmm26[4],zmm30[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm31[0],zmm28[0],zmm31[2],zmm28[2],zmm31[4],zmm28[4],zmm31[6],zmm28[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm29[0],zmm25[0],zmm29[2],zmm25[2],zmm29[4],zmm25[4],zmm29[6],zmm25[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 {%k1} # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm11[0],mem[0],zmm11[2],mem[2],zmm11[4],mem[4],zmm11[6],mem[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm2 {%k1} = zmm10[0],mem[0],zmm10[2],mem[2],zmm10[4],mem[4],zmm10[6],mem[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm28[0],zmm7[2],zmm28[2],zmm7[4],zmm28[4],zmm7[6],zmm28[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm13[0],zmm3[0],zmm13[2],zmm3[2],zmm13[4],zmm3[4],zmm13[6],zmm3[6] ; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm11[1],zmm13[3],zmm11[3],zmm13[5],zmm11[5],zmm13[7],zmm11[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -27118,46 +27156,45 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm30[1],zmm26[1],zmm30[3],zmm26[3],zmm30[5],zmm26[5],zmm30[7],zmm26[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm31[1],zmm10[1],zmm31[3],zmm10[3],zmm31[5],zmm10[5],zmm31[7],zmm10[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm12, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm24[1],zmm6[1],zmm24[3],zmm6[3],zmm24[5],zmm6[5],zmm24[7],zmm6[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm29[1],zmm6[1],zmm29[3],zmm6[3],zmm29[5],zmm6[5],zmm29[7],zmm6[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm21, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm18, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm1 @@ -27166,79 +27203,77 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm10[1],zmm0[1],zmm10[3],zmm0[3],zmm10[5],zmm0[5],zmm10[7],zmm0[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm14[1],zmm8[1],zmm14[3],zmm8[3],zmm14[5],zmm8[5],zmm14[7],zmm8[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm9[1],zmm12[1],zmm9[3],zmm12[3],zmm9[5],zmm12[5],zmm9[7],zmm12[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; 
AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, 
%zmm11 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -27246,118 +27281,119 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm28, %zmm17, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm8, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %xmm28 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 704(%rdi), %ymm28, %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm29, %ymm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm13, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 704(%rdi), %ymm13, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 640(%rdi), %ymm28, %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1216(%rdi), %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1152(%rdi), %ymm7, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1664(%rdi), %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm29, %zmm19, %zmm19 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2112(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2240(%rdi), %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %xmm29 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm29, %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm29[0],ymm6[0],ymm29[2],ymm6[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2048(%rdi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2176(%rdi), %ymm6, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2624(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2752(%rdi), %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm6[0],ymm16[0],ymm6[2],ymm16[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm23, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %xmm23 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm23, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm23[0],ymm10[0],ymm23[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm18, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3264(%rdi), %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 3136(%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3264(%rdi), %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm17, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3648(%rdi), %xmm9 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3776(%rdi), %ymm9, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3584(%rdi), %xmm14 ; AVX512DQ-BW-FCP-NEXT: vinserti128 
$1, 3712(%rdi), %ymm14, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm16 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm16, %zmm12, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm31, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm10[1],ymm23[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm6[1],ymm29[3],ymm6[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm16[1],ymm6[3],ymm16[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} @@ -27365,21 +27401,22 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 448(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 256(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -27465,20 +27502,20 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) ; 
AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08 +; AVX512DQ-BW-FCP-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 81fe19c4d8b56..55728fa12c2a6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -105,37 +105,37 @@ define void @load_i8_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun ; AVX512BW-LABEL: load_i8_stride2_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpmovwb %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovwb %xmm0, %xmm0 +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rdx) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride2_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpmovwb %xmm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovwb %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride2_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpmovwb %xmm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovwb %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rdx) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <4 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <4 x i8> %wide.vec, <4 x i8> poison, <2 x i32> @@ -432,7 +432,7 @@ 
define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX2-LABEL: load_i8_stride2_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -448,7 +448,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX2-FP-LABEL: load_i8_stride2_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FP-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -464,7 +464,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX2-FCP-LABEL: load_i8_stride2_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -480,7 +480,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-LABEL: load_i8_stride2_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -496,7 +496,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i8_stride2_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -512,7 +512,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-LABEL: load_i8_stride2_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -528,7 +528,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i8_stride2_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpand %xmm0, %xmm2, %xmm3 @@ -700,14 +700,14 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-LABEL: load_i8_stride2_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), 
%ymm2 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -719,30 +719,30 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i8_stride2_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7] -; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,5,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rdx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride2_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -754,17 +754,17 @@ define void @load_i8_stride2_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i8_stride2_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,2,5,7] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,5,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1012,13 +1012,13 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-LABEL: load_i8_stride2_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -1026,9 +1026,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = 
[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm2 @@ -1043,23 +1043,23 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512-FCP-LABEL: load_i8_stride2_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,2,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,5,7] ; AVX512-FCP-NEXT: vpermt2q %ymm5, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm5 ; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 @@ -1073,13 +1073,13 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-LABEL: load_i8_stride2_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -1087,9 +1087,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7] -; AVX512DQ-NEXT: vpbroadcastq 
{{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm2, %ymm2 @@ -1104,23 +1104,23 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512DQ-FCP-LABEL: load_i8_stride2_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,2,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,5,7] ; AVX512DQ-FCP-NEXT: vpermt2q %ymm5, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm5 ; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 @@ -1134,65 +1134,65 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512BW-LABEL: load_i8_stride2_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride2_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride2_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = 
zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..18186f52584fb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -420,10 +420,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -438,10 +438,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -456,10 +456,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -474,10 +474,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -492,10 +492,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: 
vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -510,10 +510,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -528,10 +528,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -546,10 +546,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -564,10 +564,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -582,10 +582,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -600,10 +600,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -618,10 +618,10 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -770,7 +770,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -792,7 +792,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -814,7 +814,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] 
@@ -836,7 +836,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] @@ -1317,8 +1317,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1326,8 +1325,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -1345,8 +1343,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1354,8 +1351,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -1373,8 +1369,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1382,8 +1377,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -1401,8 +1395,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1429,8 +1422,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1457,8 +1449,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1485,8 +1476,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1509,12 +1499,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1537,12 +1526,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; 
AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1565,12 +1553,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1593,12 +1580,11 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -2239,8 +2225,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -2249,8 +2234,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} 
ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] -; AVX2-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -2299,8 +2283,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] -; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -2309,8 +2292,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -2359,8 +2341,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -2369,8 +2350,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -2409,20 +2389,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512-NEXT: vmovdqa 
128(%rdi), %xmm5 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 +; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2456,20 +2435,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2503,20 +2481,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2550,20 +2527,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2600,15 +2576,14 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 @@ -2637,15 +2612,14 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 @@ -2674,15 +2648,14 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpshufb 
%zmm3, %zmm2, %zmm2 @@ -2711,15 +2684,14 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980277ece..c15d25b38a657 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -99,105 +99,105 @@ define void @load_i8_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride4_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride4_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-FCP-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512-FCP-NEXT: 
vpextrw $0, %xmm0, (%r8) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride4_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-FCP-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride4_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512BW-NEXT: 
vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512BW-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-FCP-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-BW-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-BW-FCP-NEXT: vpmovdb %xmm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, 
(%r8) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <8 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i8> %wide.vec, <8 x i8> poison, <2 x i32> @@ -856,7 +856,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,0,4] ; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] @@ -896,7 +896,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,0,4] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] @@ -936,7 +936,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,0,4] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] @@ -1620,31 +1620,31 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i8_stride4_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 ; AVX512-NEXT: vpsrld $8, %zmm2, %zmm6 ; AVX512-NEXT: vpmovdb %zmm6, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm6, 
%ymm1, %ymm7 ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 ; AVX512-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512-NEXT: vpmovdb %zmm7, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 @@ -1660,31 +1660,31 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i8_stride4_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vpsrld $8, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vpmovdb %zmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm7 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 ; AVX512-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vpmovdb %zmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 @@ -1700,31 +1700,31 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i8_stride4_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm0, 
%ymm3, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512DQ-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpsrld $8, %zmm2, %zmm6 ; AVX512DQ-NEXT: vpmovdb %zmm6, %xmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm7 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512DQ-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 ; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512DQ-NEXT: vpmovdb %zmm7, %xmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 @@ -1740,31 +1740,31 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vpsrld $8, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm7 ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 @@ -1780,7 +1780,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i8_stride4_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1808,7 +1808,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1836,7 +1836,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -1864,7 +1864,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] @@ -3022,12 +3022,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm7 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3039,7 +3039,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[0,1,2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm10 ; AVX512-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 @@ -3053,7 +3053,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm11 ; AVX512-NEXT: vpermt2d %ymm10, %ymm1, %ymm11 @@ -3067,7 +3067,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[0,1,2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 @@ -3092,12 +3092,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3109,7 +3109,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm10 ; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 @@ -3123,7 +3123,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm11 ; AVX512-FCP-NEXT: vpermt2d %ymm10, %ymm1, %ymm11 @@ -3137,7 +3137,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 @@ -3162,12 +3162,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3179,7 +3179,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm4, %ymm10 ; AVX512DQ-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 @@ -3193,7 +3193,7 @@ define void 
@load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm11 ; AVX512DQ-NEXT: vpermt2d %ymm10, %ymm1, %ymm11 @@ -3207,7 +3207,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 @@ -3232,12 +3232,12 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512DQ-FCP-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -3249,7 +3249,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm10 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 @@ -3263,7 +3263,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512DQ-FCP-NEXT: 
vpshufb %ymm9, %ymm4, %ymm11 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm10, %ymm1, %ymm11 @@ -3277,7 +3277,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 @@ -3312,7 +3312,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512BW-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512BW-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3362,7 +3362,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3412,7 +3412,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm3, %zmm7 @@ -3462,7 +3462,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm4, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm5, %zmm3, %zmm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..4f5a2c3192a01 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -757,19 +757,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -789,19 +789,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, 
%xmm5, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -821,19 +821,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -853,19 +853,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -885,19 +885,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor 
%xmm5, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -917,19 +917,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -949,19 +949,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -981,19 +981,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -1013,19 +1013,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -1045,19 +1045,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -1077,19 +1077,19 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,8,13],zero,zero,zero,xmm6[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] @@ -1397,24 +1397,24 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] +; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm5 +; AVX-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm5 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] -; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpblendvb %xmm6, %xmm8, %xmm9, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX-NEXT: vpblendvb %xmm6, %xmm9, %xmm10, %xmm6 +; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -1427,8 +1427,8 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7] ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm8 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] +; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm8 ; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] ; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm10 @@ -1464,56 +1464,56 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm7, 
%ymm1, %ymm0, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[u,u,u] ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm8 ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15,u,u,u] ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] -; AVX2-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] +; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11,u,u,u,u] +; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-NEXT: vmovdqa %xmm6, (%rcx) -; 
AVX2-NEXT: vmovdqa %xmm4, (%r8) +; AVX2-NEXT: vmovdqa %xmm1, (%r8) ; AVX2-NEXT: vmovdqa %xmm0, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1522,56 +1522,56 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] -; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[u,u,u] ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpor %xmm7, 
%xmm8, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm8 ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15,u,u,u] ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] -; AVX2-FP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovdqa %xmm4, (%r8) +; AVX2-FP-NEXT: vmovdqa %xmm1, (%r8) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1580,56 +1580,56 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[u,u,u] ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r8) +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%r8) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1644,44 +1644,44 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] -; AVX512-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm8 & (ymm4 ^ ymm5)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm7 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512-NEXT: vpor %xmm10, %xmm9, %xmm8 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] -; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] -; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm6, (%rsi) @@ -1702,44 +1702,44 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm8 & (ymm4 ^ ymm5)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpor %xmm10, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsi) @@ -1760,44 +1760,44 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-NEXT: vpshufb 
%xmm3, %xmm7, %xmm7 +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm8 & (ymm4 ^ ymm5)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpor %xmm10, %xmm9, %xmm8 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] -; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512DQ-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm2[0,1,2],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi) @@ -1818,44 +1818,44 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm8 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13] -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsi) @@ -1881,14 +1881,14 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] -; AVX512BW-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k2} +; AVX512BW-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 @@ -1901,18 +1901,18 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] -; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] @@ -1940,14 +1940,14 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k2} +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 @@ -1960,18 +1960,18 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] @@ -1999,14 +1999,14 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k2} +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 @@ -2019,18 +2019,18 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX512DQ-BW-NEXT: 
vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] @@ -2058,14 +2058,14 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 @@ -2078,18 +2078,18 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero,xmm8[u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] @@ -2693,51 +2693,52 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i8_stride5_vf32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm7 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3,4,5,6,7] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa (%rdi), %xmm6 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] ; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX-NEXT: vpor 
%xmm12, %xmm13, %xmm12 ; AVX-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 +; AVX-NEXT: vpblendvb %xmm13, %xmm11, %xmm12, %xmm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3,4],xmm7[5,6,7] ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm5, %ymm12, %ymm11 +; AVX-NEXT: vandps %ymm12, %ymm11, %ymm11 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 ; AVX-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpor %xmm14, %xmm15, %xmm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 @@ -2746,28 +2747,29 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7] -; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 +; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm14 -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7] +; AVX-NEXT: vorps %ymm0, %ymm14, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm14 +; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm15 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm14 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7] -; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 +; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 -; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[3,8,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX-NEXT: vpor %xmm14, %xmm15, %xmm12 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 @@ -2775,16 +2777,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm12 -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm13 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] -; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm14 ; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm13 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] -; AVX-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 @@ -2795,382 +2797,367 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 ; 
AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14 -; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[4,9,14] +; AVX-NEXT: vorps %ymm0, %ymm14, %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm15 +; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm6 -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] -; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4,5],xmm0[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4,5],xmm6[6,7] +; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15] +; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX-NEXT: vmovaps %ymm1, (%rdx) ; AVX-NEXT: vmovaps %ymm12, (%rcx) -; AVX-NEXT: vmovaps %ymm6, (%r8) +; AVX-NEXT: vmovaps %ymm14, (%r8) ; AVX-NEXT: vmovaps %ymm0, (%r9) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride5_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] +; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] +; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255] +; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqa %xmm8, %xmm7 -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 -; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm10 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0] ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 +; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm9 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 +; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 -; AVX2-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] -; AVX2-NEXT: vpblendvb %ymm9, 
%ymm4, %ymm3, %ymm9 -; AVX2-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm8 +; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm6 +; AVX2-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm4[4,9,14] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u] +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX2-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] -; AVX2-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX2-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u] +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,5,0,5,0,5,0,5] +; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm6, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[1,6,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm6[5,6,7],ymm0[8,9,10,11,12],ymm6[13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,8,13] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] +; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm7 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm7 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm9, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-FP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm9 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 -; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] -; AVX2-FP-NEXT: vpshufb {{.*#+}} 
xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm6 +; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm4[4,9,14] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] -; AVX2-FP-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,5,0,5,0,5,0,5] +; AVX2-FP-NEXT: vpermd %ymm8, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm6, %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[1,6,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm6[5,6,7],ymm0[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,8,13] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u] +; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm7 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm10 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,u,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm9 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] ; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = 
ymm11[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u,255] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm6, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm8 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm4[4,9,14] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} ymm3 = [255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[1,6,11,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,5,0,5,0,5,0,5] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[1,6,11] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm6[5,6,7],ymm0[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,8,13] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3223,19 +3210,19 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqa %ymm4, %ymm12 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (mem & (ymm14 ^ ymm12)) +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] -; AVX512-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 +; AVX512-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm14 ; AVX512-NEXT: vpshufb %xmm12, %xmm7, %xmm11 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 @@ -3247,10 +3234,10 @@ define void 
@load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] +; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] ; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13 @@ -3259,22 +3246,22 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm1 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512-NEXT: vmovdqa %ymm9, (%rdx) @@ -3333,19 +3320,19 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vpermq 
{{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (mem & (ymm14 ^ ymm12)) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm14 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 @@ -3357,10 +3344,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 @@ -3369,22 +3356,22 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rdx) @@ -3443,19 +3430,19 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm12 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (mem & (ymm14 ^ ymm12)) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] -; AVX512DQ-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512DQ-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm14 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm7, %xmm11 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 @@ -3467,10 +3454,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] ; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13 @@ -3479,22 +3466,22 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpermd %ymm2, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm9, (%rdx) @@ -3553,19 +3540,19 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm12 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} 
ymm14 = ymm12[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (mem & (ymm14 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 @@ -3577,10 +3564,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 @@ -3589,22 +3576,22 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: 
vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rdx) @@ -3624,16 +3611,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -3667,13 +3654,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] -; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k4} +; AVX512BW-NEXT: kmovd %eax, %k4 +; 
AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] +; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero @@ -3700,10 +3687,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} @@ -3712,11 +3699,11 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512BW-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -3736,16 +3723,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} ; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -3779,13 +3766,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 -; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k4} +; AVX512BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero @@ -3812,10 +3799,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-FCP-NEXT: movl $-33554432, %eax # imm = 0xFE000000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} @@ -3824,11 +3811,11 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; 
AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -3848,16 +3835,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} ; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -3891,13 +3878,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 -; AVX512DQ-BW-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k4} +; AVX512DQ-BW-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero @@ -3924,10 +3911,10 @@ define void 
@load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} @@ -3936,11 +3923,11 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) @@ -3960,16 +3947,16 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -4003,13 +3990,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k4} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero @@ -4036,10 +4023,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-FCP-NEXT: movl $-33554432, %eax # imm = 0xFE000000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2} @@ -4048,11 +4035,11 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) @@ -5483,7 +5470,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5514,7 +5501,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255] +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5608,787 +5595,781 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride5_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $136, %rsp +; AVX2-NEXT: subq $168, %rsp ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; 
AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm11 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] -; AVX2-NEXT: vpshufb 
%xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm14, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] +; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,3,8,13,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm15, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm15 ; AVX2-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 -; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0 -; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,4,9,14,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,0,5,10,15,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-NEXT: vpshufb %ymm1, %ymm12, %ymm12 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm12 ; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,1,6,11,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm12 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm5 +; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 -; AVX2-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX2-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm8, %ymm1 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm11 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm14 +; AVX2-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm10 +; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128] +; AVX2-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm7 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX2-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,u,u,u,u,128,128,128,2,7,12] +; AVX2-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128] +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm7 +; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm12 -; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm11 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm14 -; AVX2-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-NEXT: vpshufb %xmm15, 
%xmm5, %xmm15 -; AVX2-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 -; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,128,128,128,128,4,9,14] +; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,u,u,u,0,5,10,15,128,128,128] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX2-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,2,7,12,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm11[3,4,5,6,7] +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm11 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm11 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX2-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-NEXT: vpermd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,128,128,128,3,8,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,4,9,14,128,128,128] +; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-NEXT: vmovdqa %ymm10, 32(%r9) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm3, (%rcx) +; 
AVX2-NEXT: vmovdqa %ymm13, 32(%r8) +; AVX2-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-NEXT: addq $136, %rsp +; AVX2-NEXT: addq $168, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride5_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $136, %rsp +; AVX2-FP-NEXT: subq $168, %rsp ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm0 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm11 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpblendvb 
%ymm6, %ymm3, %ymm1, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm14, %ymm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,3,8,13,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm15, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,4,9,14,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, 
%ymm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FP-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,0,5,10,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,1,6,11,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, 
%ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm5 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 -; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm8, %ymm1 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm11 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm14 +; 
AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm10 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-FP-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-FP-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm7 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX2-FP-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,u,u,u,u,128,128,128,2,7,12] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm7 +; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm12 -; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm11 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 -; AVX2-FP-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; 
AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX2-FP-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 -; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,128,128,128,128,4,9,14] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,u,u,u,0,5,10,15,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX2-FP-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-FP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FP-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,2,7,12,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm11, %ymm11 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FP-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-FP-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,128,128,128,3,8,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,4,9,14,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%r9) +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups 
(%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm13, 32(%r8) +; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FP-NEXT: addq $136, %rsp +; AVX2-FP-NEXT: addq $168, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride5_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $136, %rsp +; AVX2-FCP-NEXT: subq $168, %rsp ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-FCP-NEXT: # ymm3 = 
mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm3, %ymm11 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 -; 
AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm14, %ymm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,3,8,13,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm15 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] +; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} xmm1 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,4,9,14,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,0,5,10,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm12 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm1, 
%xmm0, %xmm5 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,1,6,11,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm11 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,128,128,128,1,6,11] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm14 +; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm10 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,2,7,12,128,128,128] +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm7 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,u,u,u,u,128,128,128,2,7,12] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,3,8,13,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm12 -; AVX2-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm14 -; AVX2-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255] -; AVX2-FCP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15 -; AVX2-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,128,128,128,128,4,9,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,u,u,u,0,5,10,15,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX2-FCP-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,2,7,12,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = 
[0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm4, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,u,u,128,128,128,3,8,13] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,4,9,14,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx) -; 
AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%r9) +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm13, 32(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FCP-NEXT: addq $136, %rsp +; AVX2-FCP-NEXT: addq $168, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -6399,197 +6380,197 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm22 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm24 ^ ymm25)) 
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 ; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,5,0,5,0,5,0,5] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm0 +; AVX512-NEXT: vpermd %ymm12, %ymm16, %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm18) | ymm11 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX512-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm6 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm17 +; AVX512-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm5, %ymm6 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~ymm18) | ymm15 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm19 & (ymm2 ^ ymm0)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm15, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512-NEXT: vpermq 
{{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm20) | ymm4 +; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm1)) +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512-NEXT: vmovdqa %ymm3, %ymm1 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 
= [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm19 & (ymm0 ^ ymm4)) +; AVX512-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[u,u,u] +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm20) | ymm2 +; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm4)) +; AVX512-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,9,14] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,2,7,12],zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512-NEXT: vpor %xmm4, %xmm2, %xmm0 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & 
(ymm26 ^ ymm8)) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm19 & (ymm0 ^ ymm7)) +; AVX512-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & mem) | ymm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm2 & (zmm1 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm3 & (ymm12 ^ ymm14)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15] +; AVX512-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm26 ^ (ymm15 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (mem & (ymm15 ^ ymm9)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm4[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-NEXT: vpshufb 
{{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm19 & (ymm4 ^ ymm1)) +; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm3)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd %ymm3, %ymm16, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm2 & (zmm3 ^ zmm1)) +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) @@ -6604,197 +6585,197 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm22 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) ; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,5,0,5,0,5,0,5] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm0 +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm18) | ymm11 +; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX512-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-FCP-NEXT: 
vextracti128 $1, %ymm0, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm17 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~ymm18) | ymm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm19 & (ymm2 ^ ymm0)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} 
ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm20) | ymm4 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm1)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm19 & (ymm0 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm20) | ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ 
(zmm21 & (zmm1 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,9,14] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,2,7,12],zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm19 & (ymm0 ^ ymm7)) +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, 
%xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & mem) | ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm2 & (zmm1 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm3 & (ymm12 ^ ymm14)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm26 ^ (ymm15 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (mem & (ymm15 ^ ymm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm19 & (ymm4 ^ ymm1)) +; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm3)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm2 & (zmm3 ^ zmm1)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) @@ -6809,197 +6790,197 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm22 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; 
AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,5,0,5,0,5,0,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm0 +; AVX512DQ-NEXT: vpermd %ymm12, %ymm16, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm18) | ymm11 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX512DQ-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm17 +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~ymm18) | ymm15 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm19 & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm20) | ymm4 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; 
AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm19 & (ymm0 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm20) | ymm2 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; 
AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,9,14] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,2,7,12],zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm0 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm19 & (ymm0 ^ ymm7)) +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & mem) | ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm2 & (zmm1 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm3 & (ymm12 ^ ymm14)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15] +; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm26 ^ (ymm15 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (mem & (ymm15 ^ ymm9)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm19 & (ymm4 ^ ymm1)) +; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm3)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermd %ymm3, %ymm16, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm2 & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) @@ -7014,197 +6995,197 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm22 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) ; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,5,0,5,0,5,0,5] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm18) | ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~ymm18) | ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm19 & (ymm2 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm1 = (ymm1 & ymm20) | ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm19 & (ymm0 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm20) | ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm12 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,9,14] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,2,7,12],zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = 
ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm19 & (ymm0 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm24 ^ (ymm4 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,11],zero,zero,zero,zero,xmm4[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[2,7,12],zero,zero,zero,xmm4[0,5,10,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & mem) | ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm2 & (zmm1 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm3 & (ymm12 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm26 ^ (ymm15 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (mem & (ymm15 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm19 & (ymm4 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm2 & (zmm3 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) @@ -7223,15 +7204,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm9 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -7245,109 +7226,109 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm20 = [0,5,0,5,0,5,0,5] +; AVX512BW-NEXT: vpermd %ymm8, %ymm20, %ymm8 ; AVX512BW-NEXT: movl $127, %eax +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] ; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; 
AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm8 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm9 {%k5} +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm14 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm13 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u] ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k6} +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 ; 
AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm16 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k1} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm17 +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm17 {%k4} ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm18 {%k3} ; AVX512BW-NEXT: kmovd %eax, %k6 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[1,6,11],zero,zero,zero,zero,xmm19[4,9,14,u,u,u] ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm18[2,7,12],zero,zero,zero,xmm18[0,5,10,15],zero,zero,zero,xmm18[u,u,u] +; AVX512BW-NEXT: vporq %xmm19, %xmm16, %xmm16 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm18 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512BW-NEXT: vmovdqu8 %ymm18, %ymm15 {%k6} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm18, %xmm17, %xmm4 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm4 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm4[u,u,u,1,6,11],zero,zero,zero,zero,xmm4[4,9,14],zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[3,8,13] +; AVX512BW-NEXT: vporq %xmm17, %xmm4, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm15 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 @@ -7368,30 +7349,30 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-NEXT: kmovq %rax, %k5 ; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm9 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm11 +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm11 {%k4} ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} @@ -7400,21 +7381,21 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} ; 
AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512BW-NEXT: vpermd %ymm3, %ymm20, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -7426,7 +7407,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) @@ -7445,15 +7426,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -7467,109 +7448,109 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm20 = [0,5,0,5,0,5,0,5] +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm20, %ymm8 ; AVX512BW-FCP-NEXT: movl $127, %eax +; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm8 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm9 {%k5} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm13 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u] ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k6} +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm16 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm17 +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm17 {%k4} ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm18 {%k3} ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[1,6,11],zero,zero,zero,zero,xmm19[4,9,14,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm18[2,7,12],zero,zero,zero,xmm18[0,5,10,15],zero,zero,zero,xmm18[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm15[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm15 {%k6} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm4 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; 
AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm4[u,u,u,1,6,11],zero,zero,zero,zero,xmm4[4,9,14],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[3,8,13] +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm4, %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 @@ -7590,30 +7571,30 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 
$1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm9 {%k3} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm11 +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm11 {%k4} ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} @@ -7622,21 +7603,21 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm20, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -7648,7 +7629,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) 
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) @@ -7667,15 +7648,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm9 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -7689,109 +7670,109 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm20 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm20, %ymm8 ; AVX512DQ-BW-NEXT: movl $127, %eax +; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm8 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 
{%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm9 {%k5} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm14 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm13 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u] ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k6} +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k2} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 
= xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm16 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm17 +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm17 {%k4} ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000 +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm18 {%k3} ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[1,6,11],zero,zero,zero,zero,xmm19[4,9,14,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm18[2,7,12],zero,zero,zero,xmm18[0,5,10,15],zero,zero,zero,xmm18[u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm16, %xmm16 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; 
AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm18 = ymm15[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm18, %ymm15 {%k6} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm17, %xmm4 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm4 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm4[u,u,u,1,6,11],zero,zero,zero,zero,xmm4[4,9,14],zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[3,8,13] +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm4, %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm15 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000 @@ -7812,30 +7793,30 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-NEXT: kmovq %rax, %k5 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm9 {%k3} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm11 +; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm11 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} @@ -7844,21 +7825,21 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512DQ-BW-NEXT: vpermd %ymm3, %ymm20, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -7870,7 +7851,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8) @@ -7889,15 +7870,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] @@ -7911,109 +7892,109 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm20 = [0,5,0,5,0,5,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm20, %ymm8 ; AVX512DQ-BW-FCP-NEXT: movl $127, %eax +; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11] ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm9 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u] ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k6} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 
$1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm17 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm18 {%k3} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[1,6,11],zero,zero,zero,zero,xmm19[4,9,14,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm18[2,7,12],zero,zero,zero,xmm18[0,5,10,15],zero,zero,zero,xmm18[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm15[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm15 {%k6} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm4[u,u,u,1,6,11],zero,zero,zero,zero,xmm4[4,9,14],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15],zero,zero,zero,xmm4[3,8,13] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000 @@ -8034,30 +8015,30 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: 
vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm9 {%k3} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm11 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} @@ -8066,21 +8047,21 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermd 
%ymm3, %ymm20, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -8092,7 +8073,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index f87126a98eea4..ec4fa8c0355bf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -62,7 +62,6 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride6_vf2: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -75,12 +74,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride6_vf2: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -93,12 +92,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride6_vf2: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -111,12 +110,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride6_vf2: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -129,12 +128,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 
; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride6_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -147,12 +146,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride6_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -165,12 +164,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride6_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -183,12 +182,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -201,12 +200,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride6_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -219,12 +218,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride6_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -237,12 +236,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride6_vf2: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -255,12 +254,12 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -273,6 +272,7 @@ define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i8>, ptr %in.vec, align 64 @@ -327,11 +327,11 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 ; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] @@ -371,21 +371,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vmovd %xmm2, (%rsi) ; AVX-NEXT: vmovd %xmm3, (%rdx) @@ -402,21 +402,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vmovd %xmm2, (%rsi) ; AVX2-NEXT: vmovd %xmm3, (%rdx) @@ -433,21 +433,21 @@ define void @load_i8_stride6_vf4(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FP-NEXT: vmovd %xmm3, (%rdx) @@ -464,21 +464,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor 
%xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FCP-NEXT: vmovd %xmm3, (%rdx) @@ -490,28 +490,28 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i8_stride6_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[2,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[1,7,13],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[4,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,8,14],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[3,9,15],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovd %xmm2, (%rsi) +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vmovd %xmm1, (%rsi) ; AVX512-NEXT: vmovd %xmm3, (%rdx) ; AVX512-NEXT: vmovd %xmm4, (%rcx) ; AVX512-NEXT: vmovd %xmm5, (%r8) @@ -521,28 +521,28 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i8_stride6_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[2,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[1,7,13],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[4,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,8,14],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[3,9,15],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi) ; AVX512-FCP-NEXT: vmovd %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovd %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovd %xmm5, (%r8) @@ -552,28 +552,28 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i8_stride6_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[2,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[1,7,13],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[4,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,8,14],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[3,9,15],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovd %xmm2, (%rsi) +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-NEXT: vmovd %xmm1, (%rsi) ; AVX512DQ-NEXT: vmovd %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovd %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovd %xmm5, (%r8) @@ -583,28 +583,28 @@ define void @load_i8_stride6_vf4(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm0[2,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[1,7,13],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[4,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,8,14],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[3,9,15],zero,xmm2[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi) ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%r8) @@ -619,21 +619,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-NEXT: vmovd %xmm3, (%rdx) @@ -650,21 +650,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, 
%xmm5, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rdx) @@ -681,21 +681,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rdx) @@ -712,21 +712,21 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rdx) @@ -915,16 +915,16 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpor %xmm6, %xmm7, %xmm5 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] @@ -934,12 +934,12 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 ; 
AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovq %xmm3, (%rsi) ; AVX-NEXT: vmovq %xmm4, (%rdx) @@ -958,25 +958,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm4, (%rsi) ; AVX2-NEXT: vmovq %xmm2, (%rdx) @@ -996,25 +996,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) @@ -1034,25 +1034,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX2-FCP-NEXT: vpor 
%xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -1072,25 +1072,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm4, (%rsi) ; AVX512-NEXT: vmovq %xmm2, (%rdx) @@ -1110,25 +1110,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -1148,25 +1148,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) @@ -1186,25 +1186,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -1224,25 +1224,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-NEXT: vmovq %xmm2, (%rdx) @@ -1262,25 +1262,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -1300,25 +1300,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; 
AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) @@ -1338,25 +1338,25 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, 
%xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) @@ -1683,61 +1683,61 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride6_vf16: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm1 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm6 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX-NEXT: vpblendvb %xmm9, %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX-NEXT: vpor %xmm10, %xmm11, %xmm9 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3,4,5],xmm9[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX-NEXT: vpblendvb %xmm8, %xmm7, %xmm9, %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm9[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] -; AVX-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8 +; AVX-NEXT: vpblendvb %xmm11, %xmm9, %xmm10, %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12] ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm8 +; AVX-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm12 ; AVX-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm10 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13] ; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm11, %xmm9 +; AVX-NEXT: vpblendvb %xmm8, %xmm10, %xmm11, %xmm8 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] @@ -1750,10 +1750,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero @@ -1761,38 +1761,37 @@ define 
void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX-NEXT: vmovdqa %xmm6, (%rsi) ; AVX-NEXT: vmovdqa %xmm7, (%rdx) -; AVX-NEXT: vmovdqa %xmm8, (%rcx) -; AVX-NEXT: vmovdqa %xmm9, (%r8) +; AVX-NEXT: vmovdqa %xmm9, (%rcx) +; AVX-NEXT: vmovdqa %xmm8, (%r8) ; AVX-NEXT: vmovdqa %xmm10, (%r9) ; AVX-NEXT: vmovdqa %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride6_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm6 ; AVX2-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1800,26 +1799,27 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] -; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm7 ; AVX2-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm8 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] @@ -1837,30 +1837,29 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FP-LABEL: load_i8_stride6_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] +; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-FP-NEXT: 
vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vpor %xmm7, %xmm9, %xmm6 ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1868,26 +1867,27 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] -; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-FP-NEXT: vpor %xmm10, %xmm11, %xmm7 ; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-FP-NEXT: vpshufb {{.*#+}} 
xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FP-NEXT: vpor %xmm10, %xmm11, %xmm8 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] @@ -1905,30 +1905,29 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride6_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm9, %xmm6 ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -1936,26 +1935,27 @@ define void 
@load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm7 ; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] @@ -1973,7 +1973,6 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i8_stride6_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -1988,10 +1987,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) 
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero @@ -2005,10 +2004,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] @@ -2045,7 +2045,6 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-FCP-LABEL: load_i8_stride6_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -2060,10 +2059,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero @@ -2077,10 +2076,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] @@ -2117,7 +2117,6 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-LABEL: load_i8_stride6_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -2132,10 +2131,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero @@ -2149,10 +2148,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] @@ -2189,7 +2189,6 @@ define void @load_i8_stride6_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -2204,10 +2203,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero @@ -2221,10 +2220,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] @@ -2261,296 +2261,296 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i8_stride6_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm2, %ymm3 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm3[0,6,12],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 -; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,7,13],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm1 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm0, %ymm8 {%k3} ; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} -; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 -; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm4 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] -; AVX512BW-NEXT: vpor 
%xmm6, %xmm9, %xmm6 -; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm6 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm8, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm4, (%r8) -; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) +; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm6, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm8, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride6_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm2, %ymm3 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,6,12],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 -; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,7,13],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm1 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX512BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm0, %ymm8 {%k3} ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} -; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 -; AVX512BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm4 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = 
[128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm6 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride6_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm2, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,6,12],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm5, %xmm1 ; 
AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 -; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] +; AVX512DQ-BW-NEXT: movw $-2048, %ax # imm = 0xF800 +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,7,13],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm1 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX512DQ-BW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm0, %ymm8 {%k3} ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} -; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 -; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm4 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = 
[128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] -; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm2, %ymm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,6,12],zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpor 
%xmm1, %xmm5, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,7,13],zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm1 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm0, %ymm8 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3197,243 +3197,243 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i8_stride6_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $120, %rsp -; AVX-NEXT: vmovdqa (%rdi), %xmm9 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa (%rdi), %xmm5 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm13[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[2,8,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,9,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm12[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm12[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX-NEXT: 
vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm4 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm10 -; AVX-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm4 +; AVX-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX-NEXT: # xmm10 = mem[0,0] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX-NEXT: # xmm9 = mem[0,0] +; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm14 +; AVX-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm14 +; AVX-NEXT: vmovd {{.*#+}} xmm1 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm10 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm8 -; AVX-NEXT: vmovdqa %xmm7, %xmm10 -; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX-NEXT: vorps %ymm1, %ymm2, %ymm8 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm15 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX-NEXT: vpshufb %xmm7, %xmm12, %xmm3 +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm7 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm5 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm11 -; AVX-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm11, %xmm11 -; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; 
AVX-NEXT: vpblendvb %xmm3, %xmm15, %xmm11, %xmm15 -; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8 -; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX-NEXT: vandnps %ymm15, %ymm11, %ymm15 -; AVX-NEXT: vorps %ymm15, %ymm8, %ymm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX-NEXT: vpor %xmm10, %xmm9, %xmm10 +; AVX-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm10, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3,4,5],xmm7[6,7] +; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm14, %ymm10, %ymm14 +; AVX-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX-NEXT: vorps %ymm7, %ymm14, %ymm14 +; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX-NEXT: vandps %ymm7, %ymm14, %ymm14 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm14, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[1,7,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm15, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11] -; AVX-NEXT: vmovdqa %xmm14, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm15 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm10[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm15, %xmm10, %xmm10 -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5],xmm10[6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX-NEXT: vandps %ymm14, %ymm10, %ymm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm1[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[1,7,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,11] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; 
AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm13 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm12[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4,5],xmm12[6,7] +; AVX-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX-NEXT: vandps %ymm10, %ymm12, %ymm10 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm12 +; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX-NEXT: vpblendvb %xmm9, %xmm3, %xmm12, %xmm3 ; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm8, %xmm1 -; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm10 -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] -; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm14 -; AVX-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm1[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[2,8,14],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX-NEXT: # xmm7 = mem[0,0] +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm10 +; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vandps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm14[0],xmm10[0] -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm11 -; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm10, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm15, %xmm3 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm13[0],xmm10[0] +; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm7, %xmm7 +; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm11 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm14 -; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] -; AVX-NEXT: # xmm15 = mem[0,0] -; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm8 -; AVX-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm14[0],xmm8[0] -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm14 -; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm8, %xmm1, %xmm1 -; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4,10],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm8 -; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,9,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX-NEXT: vmovddup 
{{.*#+}} xmm12 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] ; AVX-NEXT: # xmm14 = mem[0,0] -; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm15 +; AVX-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm15[0],xmm13[0] +; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm12 ; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX-NEXT: vpblendvb %xmm9, %xmm13, %xmm12, %xmm12 +; AVX-NEXT: vandnps %ymm7, %ymm9, %ymm7 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm9 +; AVX-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[4,10],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX-NEXT: # xmm14 = mem[0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm15 +; AVX-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm9 +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm13, %ymm9, %ymm9 +; AVX-NEXT: vorps %ymm15, %ymm9, %ymm9 +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm14 +; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4],xmm1[5,6,7] -; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4],xmm12[5,6,7] +; AVX-NEXT: vandps %ymm10, %ymm9, %ymm9 +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX-NEXT: vorps %ymm12, %ymm9, %ymm9 +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,11],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm1[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX-NEXT: vmovddup {{.*#+}} xmm12 = 
[0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX-NEXT: # xmm14 = mem[0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm6, %ymm13, %ymm6 +; AVX-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX-NEXT: vandps %ymm6, %ymm10, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[5,11],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX-NEXT: # xmm9 = mem[0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm12 -; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX-NEXT: # xmm13 = mem[0,0] -; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX-NEXT: vorps %ymm7, %ymm1, %ymm1 -; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] -; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, (%rdx) -; AVX-NEXT: vmovaps %ymm11, (%rcx) -; AVX-NEXT: vmovaps %ymm3, (%r8) -; AVX-NEXT: vmovaps %ymm0, (%r9) +; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rcx) +; AVX-NEXT: vmovaps %ymm7, (%r8) +; AVX-NEXT: vmovaps %ymm9, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm1, (%rax) ; AVX-NEXT: addq $120, %rsp @@ -3442,1193 +3442,1189 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride6_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 
32(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] -; AVX2-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 -; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] -; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 -; AVX2-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = 
[0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm1[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm1 +; AVX2-NEXT: vpor %xmm6, %xmm10, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 -; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm8 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[2,8,14,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 -; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero -; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm14 +; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm15, %ymm13 +; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm10, %ymm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[3,9,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm14, %ymm14 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12] +; AVX2-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm9, %ymm13, %ymm6, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13] +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm8 +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vpblendvb %ymm9, %ymm14, %ymm7, %ymm7 +; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero +; AVX2-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm13[0,6,12],zero,zero,zero,xmm13[4,10,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm14[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm5 +; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm13[1,7,13],zero,zero,zero,xmm13[5,11,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10] +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,4,10],zero,zero,zero,xmm8[2,8,14],zero,zero +; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,5,11],zero,zero,zero,xmm8[3,9,15],zero,zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqa %ymm0, (%rsi) ; AVX2-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride6_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 
64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] -; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 -; AVX2-FP-NEXT: 
vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm1[0,1] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpor %xmm6, %xmm10, %xmm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm8 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[2,8,14,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm14 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm15, %ymm13 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm10, %ymm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[3,9,15,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm14, %ymm14 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12] +; AVX2-FP-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm13, %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13] +; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm8 +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm14, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FP-NEXT: vpshufb 
{{.*#+}} xmm2 = zero,zero,xmm13[0,6,12],zero,zero,zero,xmm13[4,10,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm13[1,7,13],zero,zero,zero,xmm13[5,11,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,4,10],zero,zero,zero,xmm8[2,8,14],zero,zero +; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,5,11],zero,zero,zero,xmm8[3,9,15],zero,zero +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm6, 
(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride6_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm1[0,1] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[2,8,14,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero,xmm8[u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm15, %ymm13 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm12, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[3,9,15,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero,xmm8[u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm8, %ymm14, %ymm14 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12] +; AVX2-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13] 
+; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm8 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm14, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm13[0,6,12],zero,zero,zero,xmm13[4,10,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm13[1,7,13],zero,zero,zero,xmm13[5,11,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,4,10],zero,zero,zero,xmm8[2,8,14],zero,zero +; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,5,11],zero,zero,zero,xmm8[3,9,15],zero,zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] -; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm18 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm17 ^ ymm18)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm1 ^ ymm4)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm8 ^ ymm6)) ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero -; AVX512-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero -; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm7 & ymm16) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ymm16) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm17 ^ (ymm12 & (ymm18 ^ ymm17)) +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm13[4,10],zero,zero,zero,xmm13[2,8,14,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,8,14],zero,zero,xmm12[0,6,12],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm4 ^ ymm1)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) -; AVX512-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] -; AVX512-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm9, 
%xmm10, %xmm9 -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) +; AVX512-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm6 ^ ymm8)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm11 & ymm16) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[2,8,14],zero,zero,xmm11[0,6,12] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[5,11],zero,zero,zero,xmm13[3,9,15,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,9,15],zero,zero,xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm15 & (ymm3 ^ ymm14)) +; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[3,9,15],zero,zero,xmm11[1,7,13] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm16) +; AVX512-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm15 & (ymm0 ^ ymm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm9 & (ymm8 ^ ymm6)) +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4],xmm11[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm8 ^ (ymm15 & (ymm4 ^ ymm8)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero ; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) -; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm15 & (ymm1 ^ ymm6)) +; AVX512-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512-NEXT: vmovdqa %ymm2, (%r8) -; AVX512-NEXT: vmovdqa %ymm5, (%r9) -; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512-NEXT: vmovdqa %ymm0, (%r8) +; AVX512-NEXT: vmovdqa %ymm4, (%r9) +; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm17 ^ ymm18)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; 
AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm1 ^ ymm4)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm8 ^ ymm6)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm7 & ymm16) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ymm16) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm17 ^ (ymm12 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm13[4,10],zero,zero,zero,xmm13[2,8,14,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,8,14],zero,zero,xmm12[0,6,12],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm4 ^ ymm1)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; 
AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm6 ^ ymm8)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm11 & ymm16) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[2,8,14],zero,zero,xmm11[0,6,12] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[5,11],zero,zero,zero,xmm13[3,9,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,9,15],zero,zero,xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm15 & (ymm3 ^ ymm14)) +; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[3,9,15],zero,zero,xmm11[1,7,13] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm16) +; AVX512-FCP-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm15 & (ymm0 ^ ymm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm9 & (ymm8 ^ ymm6)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm11[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm8 ^ (ymm15 & (ymm4 ^ ymm8)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm15 & (ymm1 ^ ymm6)) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; 
AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) -; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm18 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm17 ^ ymm18)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm1 ^ ymm4)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm8 ^ ymm6)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero -; AVX512DQ-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero -; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm7 & ymm16) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero +; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ymm16) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero +; AVX512DQ-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm17 ^ (ymm12 & (ymm18 ^ ymm17)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm13[4,10],zero,zero,zero,xmm13[2,8,14,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,8,14],zero,zero,xmm12[0,6,12],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm4 ^ ymm1)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] -; AVX512DQ-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm6 ^ ymm8)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm11 & ymm16) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[2,8,14],zero,zero,xmm11[0,6,12] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[5,11],zero,zero,zero,xmm13[3,9,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,9,15],zero,zero,xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm15 & (ymm3 ^ ymm14)) +; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[3,9,15],zero,zero,xmm11[1,7,13] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm16) +; AVX512DQ-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm15 & (ymm0 ^ ymm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm9 & (ymm8 ^ ymm6)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm11[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm8 ^ (ymm15 & (ymm4 ^ ymm8)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) -; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm15 & (ymm1 ^ ymm6)) +; AVX512DQ-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm2, (%r8) -; AVX512DQ-NEXT: vmovdqa %ymm5, (%r9) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%r8) +; AVX512DQ-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm17 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, 
%ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm1 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm8 ^ ymm6)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm7 & ymm16) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ymm16) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm17 ^ (ymm12 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm13[4,10],zero,zero,zero,xmm13[2,8,14,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[2,8,14],zero,zero,xmm12[0,6,12],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm4 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm6 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,0,6,12],zero,zero,zero,xmm0[4,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm11 & ymm16) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[2,8,14],zero,zero,xmm11[0,6,12] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[5,11],zero,zero,zero,xmm13[3,9,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,9,15],zero,zero,xmm12[1,7,13],zero,zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm15 & (ymm3 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,1,7,13],zero,zero,zero,xmm0[5,11],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[3,9,15],zero,zero,xmm11[1,7,13] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm16) +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm15 & (ymm0 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm9 & (ymm8 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[4,10,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm11[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm8 ^ (ymm15 & (ymm4 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[5,11,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm15 & (ymm1 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi) ; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224 -; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} -; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 -; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm6 +; AVX512BW-NEXT: movw $-28124, %ax # imm = 0x9224 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm6, %ymm4 {%k2} +; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm3, %ymm7 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm9 +; AVX512BW-NEXT: movl $4192256, %eax # imm = 0x3FF800 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm2, %ymm10 {%k1} +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero -; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0,1,2],ymm12[3,4,5,6,7],ymm9[8,9,10],ymm12[11,12,13,14,15] +; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm1, %ymm9 {%k2} ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm0, %ymm10 {%k3} +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k4 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] -; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm5, %ymm12 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} +; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512BW-NEXT: kmovd %edi, %k5 -; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] -; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} +; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm6[5,6,7] +; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm2 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm6 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-NEXT: vmovdqa %ymm5, (%rsi) -; AVX512BW-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx) ; AVX512BW-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512BW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm9, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm10, (%r8) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r9) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride6_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vperm2i128 
{{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm6 +; AVX512BW-FCP-NEXT: movw $-28124, %ax # imm = 0x9224 +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm6, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm3, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm5, %xmm9 +; AVX512BW-FCP-NEXT: movl $4192256, %eax # imm = 0x3FF800 +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm2, %ymm10 {%k1} +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0,1,2],ymm12[3,4,5,6,7],ymm9[8,9,10],ymm12[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm1, %ymm9 {%k2} ; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; 
AVX512BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm0, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] -; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm5, %ymm12 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: kmovd %edi, %k5 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k5} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} +; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm6[5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm2 {%k3} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm6 {%k2} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm2[u,u,u,u,u,3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride6_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512DQ-BW-NEXT: movw $-28124, %r10w # imm = 0x9224 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} -; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm6 +; AVX512DQ-BW-NEXT: movw $-28124, %ax # imm = 0x9224 +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm6, %ymm4 {%k2} +; AVX512DQ-BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm3, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm5, %xmm9 +; AVX512DQ-BW-NEXT: movl $4192256, %eax # imm = 0x3FF800 +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm2, %ymm10 {%k1} +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0,1,2],ymm12[3,4,5,6,7],ymm9[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm1, %ymm9 {%k2} ; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm0, %ymm10 {%k3} +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] -; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm5, %ymm12 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512DQ-BW-NEXT: movw $9289, %di # imm = 0x2449 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: kmovd %edi, %k5 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm2 {%k3} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm6 {%k2} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rsi) -; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512DQ-BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 -; 
AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm6 +; AVX512DQ-BW-FCP-NEXT: movw $-28124, %ax # imm = 0x9224 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm6, %ymm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm3, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm5, %xmm9 +; AVX512DQ-BW-FCP-NEXT: movl $4192256, %eax # imm = 0x3FF800 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm2, %ymm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0,1,2],ymm12[3,4,5,6,7],ymm9[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm1, %ymm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm0, %ymm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm5, %ymm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[2,8,14],zero,zero,xmm13[0,6,12] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u],zero,zero,zero,xmm13[3,9,15],zero,zero,xmm13[1,7,13] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[2,8,14] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm2[u,u,u,u,u,2,8,14],zero,zero,xmm2[0,6,12],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[3,9,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,3,9,15],zero,zero,xmm2[1,7,13],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -5943,10 +5939,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride6_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $616, %rsp # imm = 0x268 +; AVX-NEXT: subq $648, %rsp # imm = 0x288 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm8 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm9 @@ -5957,15 +5953,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm0 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm1 -; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm1 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm0 -; AVX-NEXT: vmovdqa %xmm2, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa %xmm2, %xmm5 ; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] -; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 +; 
AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm1 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm2 @@ -5973,255 +5968,257 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm2 ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovd {{.*#+}} xmm13 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vmovq {{.*#+}} xmm15 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm2 -; AVX-NEXT: vmovdqa %xmm7, %xmm14 -; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa %xmm4, %xmm7 +; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm2 +; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm1 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm2 +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm0 +; AVX-NEXT: vmovdqa %xmm5, %xmm14 ; AVX-NEXT: vmovq {{.*#+}} xmm13 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm0 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm0 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm2 -; AVX-NEXT: vmovdqa %xmm10, %xmm12 +; AVX-NEXT: vmovdqa %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX-NEXT: vpshufb %xmm5, %xmm9, %xmm2 ; AVX-NEXT: vmovdqa %xmm9, %xmm15 -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm0, %xmm13 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm8 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa %xmm14, %xmm13 -; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX-NEXT: vmovdqa %xmm3, %xmm14 +; AVX-NEXT: vmovq {{.*#+}} xmm11 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm5 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa %xmm4, %xmm9 +; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX-NEXT: vmovdqa %xmm6, %xmm10 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX-NEXT: vmovdqa %xmm6, %xmm8 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpblendvb %xmm13, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm1 -; AVX-NEXT: vmovdqa %xmm11, %xmm8 -; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm12 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; 
AVX-NEXT: vmovq {{.*#+}} xmm11 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX-NEXT: # xmm11 = mem[0,0] ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10] +; AVX-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX-NEXT: # xmm10 = mem[0,0] ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm4 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] -; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX-NEXT: vmovd {{.*#+}} xmm1 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] +; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX-NEXT: vmovdqa %xmm0, %xmm8 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm5 -; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm9 +; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm5 +; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm9 ; AVX-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX-NEXT: vorps %ymm2, %ymm4, %ymm9 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm4 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm4 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX-NEXT: # xmm0 = mem[0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm10 +; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX-NEXT: # xmm7 = 
mem[0,0] +; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm10, %xmm0 +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX-NEXT: vpor %xmm2, %xmm10, %xmm10 -; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm10, %xmm10 -; AVX-NEXT: vmovdqa %ymm2, %ymm5 -; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX-NEXT: vandps %ymm7, %ymm9, %ymm9 +; AVX-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpblendvb %xmm14, %xmm4, %xmm10, %xmm10 +; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX-NEXT: vandps %ymm12, %ymm9, %ymm9 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vandnps %ymm10, %ymm7, %ymm10 -; AVX-NEXT: vorps %ymm10, %ymm9, %ymm4 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vandnps %ymm10, %ymm12, %ymm10 +; AVX-NEXT: vorps %ymm10, %ymm9, %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm9 +; AVX-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm9 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm10 -; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm11, %xmm1 ; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm11 +; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm10 +; AVX-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm11 ; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm6 -; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5],xmm3[6,7] -; AVX-NEXT: vandnps %ymm9, %ymm13, %ymm6 -; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX-NEXT: vmovaps %ymm13, %ymm11 -; AVX-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm3 +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX-NEXT: vpshufb %xmm8, %xmm15, %xmm10 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX-NEXT: 
vpshufb %xmm1, %xmm13, %xmm6 +; AVX-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3,4,5],xmm6[6,7] ; AVX-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm0 -; AVX-NEXT: vmovdqa 336(%rdi), %xmm10 -; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm1 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10] +; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vandps %ymm7, %ymm3, %ymm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX-NEXT: vpblendvb %xmm14, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm3, %ymm11, %ymm1 +; AVX-NEXT: vandps %ymm6, %ymm11, %ymm3 +; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX-NEXT: # xmm6 = mem[0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11] -; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] -; AVX-NEXT: # xmm9 = mem[0,0] +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] +; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm14 -; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14 +; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm13 +; AVX-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm13 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: 
vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm15 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm15 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm13 -; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5],xmm13[6,7] -; AVX-NEXT: vandnps %ymm14, %ymm11, %ymm13 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm15 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm14 +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5],xmm14[6,7] +; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 ; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm14, %xmm14 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm15 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm15, %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX-NEXT: vandnps %ymm13, %ymm15, %ymm13 +; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm13 ; AVX-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm13 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm14 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm13 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm14 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 @@ -6229,86 +6226,85 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5],xmm4[6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm4, %ymm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX-NEXT: vpblendvb %xmm15, %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm12 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm13, %xmm15, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] ; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX-NEXT: vmovdqa %ymm8, %ymm9 -; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: vorps %ymm1, %ymm4, %ymm4 -; AVX-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm4 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm5 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm5 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX-NEXT: vmovdqa %xmm6, %xmm15 -; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm6 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendvb %xmm9, %xmm1, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: vorps %ymm1, %ymm5, %ymm5 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm4, %ymm1 +; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX-NEXT: vorps %ymm4, %ymm5, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm10[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm5 ; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm5 -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpblendvb %xmm9, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm8, %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vandps %ymm5, %ymm4, %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -6316,35 +6312,35 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm0 +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm3 +; AVX-NEXT: vpshufb %xmm13, %xmm15, %xmm3 +; AVX-NEXT: vmovdqa %xmm15, %xmm14 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] ; AVX-NEXT: # xmm5 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX-NEXT: vmovdqa %xmm12, %xmm9 +; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm9, %ymm13 -; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: vmovdqa %ymm8, %ymm13 +; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload ; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 ; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm12 = 
[0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm7 -; AVX-NEXT: vmovdqa %xmm15, %xmm8 +; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm7 +; AVX-NEXT: vmovdqa %xmm11, %xmm8 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpblendvb %xmm13, %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] @@ -6353,22 +6349,23 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[3,9,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm10[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[3,9,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm6 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm5 ; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX-NEXT: vpblendvb %xmm13, %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vandnps %ymm0, %ymm13, %ymm0 @@ -6379,131 +6376,128 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm13 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm2 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] ; AVX-NEXT: # xmm3 = mem[0,0] -; 
AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm2 +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] ; AVX-NEXT: # xmm4 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm5 ; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm14, %xmm6 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm6 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm6 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm7 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vandnps %ymm5, %ymm0, %ymm5 +; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5 ; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[4,10],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm6 +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm6 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm7 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX-NEXT: vandnps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm6 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vandps %ymm0, %ymm5, %ymm4 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vandps %ymm2, %ymm5, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm4, %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX-NEXT: vorps %ymm3, %ymm4, %ymm4 ; AVX-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] ; AVX-NEXT: # xmm5 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm7 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] ; AVX-NEXT: # xmm6 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm8 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm8 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX-NEXT: vpshufb %xmm6, %xmm14, %xmm8 ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm8 +; AVX-NEXT: vpshufb %xmm13, 
%xmm9, %xmm8 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm9 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm12[1],xmm8[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm7 +; AVX-NEXT: vandnps %ymm7, %ymm9, %ymm7 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm8 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm9 -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm12 +; AVX-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: vandps %ymm7, %ymm10, %ymm0 +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: vandps %ymm0, %ymm7, %ymm0 ; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm6 ; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm7 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX-NEXT: vandnps %ymm5, %ymm2, %ymm1 +; AVX-NEXT: vandnps %ymm5, %ymm9, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 32(%rsi) @@ -6521,230 +6515,229 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps %ymm1, 32(%r8) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 32(%r9) +; AVX-NEXT: vmovaps %ymm4, 32(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm0, 32(%rax) ; AVX-NEXT: vmovaps %ymm3, (%rax) -; AVX-NEXT: addq $616, %rsp # imm = 0x268 +; AVX-NEXT: addq $648, %rsp # imm = 0x288 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride6_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 -; AVX2-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm2 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm10 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] -; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm11 +; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm8, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX2-NEXT: 
vpor %xmm0, %xmm10, %xmm1 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[0,1],ymm9[0,1] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm10[2,3],ymm9[2,3] +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm15 +; AVX2-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm5 +; AVX2-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm14, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm5 -; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: vpshufb %ymm14, %ymm3, %ymm15 -; AVX2-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 -; AVX2-NEXT: vmovdqa %ymm8, %ymm7 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 -; AVX2-NEXT: vpshufb %ymm14, %ymm0, %ymm6 -; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm4 +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm15, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX2-NEXT: vpshufb %xmm8, %xmm15, %xmm1 +; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm0 +; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm6 +; AVX2-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm1 +; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 +; AVX2-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb 
%ymm2, %ymm6, %ymm1 +; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm14 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 -; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm15 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 -; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm6, %ymm12 +; AVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] -; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 -; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX2-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, 
%ymm3 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX2-NEXT: vpor %xmm2, %xmm9, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm10, %ymm14, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX2-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm11 -; AVX2-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX2-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX2-NEXT: vpshufb %xmm7, %xmm15, %xmm2 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm9 +; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-NEXT: vpshufb %ymm9, 
%ymm13, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX2-NEXT: vpshufb %xmm7, %xmm15, %xmm11 -; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm7 +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 +; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX2-NEXT: vpshufb %xmm8, %xmm15, %xmm9 +; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-NEXT: vpshufb %ymm8, %ymm13, %ymm13 -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-NEXT: vpshufb %xmm14, %xmm10, %xmm2 +; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm15 = 
[5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm6 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm6 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm8, %xmm12, %xmm2 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm2 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-NEXT: vpshufb %ymm15, %ymm5, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] @@ -6795,229 +6788,229 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r8) -; AVX2-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride6_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] -; 
AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm10 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-FP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm11 +; AVX2-FP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm8, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX2-FP-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[0,1],ymm9[0,1] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm10[2,3],ymm9[2,3] +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm15 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm5 +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} xmm4 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm14, %xmm1 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 -; AVX2-FP-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 -; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm7 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 -; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = 
[2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm15, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm15, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm0 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm6 +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm1 +; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm14 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm15 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm1, 
%ymm6, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm6, %ymm12 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm9, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm5 +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 -; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX2-FP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX2-FP-NEXT: vpor %xmm2, %xmm9, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendvb 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm14, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm11 -; AVX2-FP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm2 +; AVX2-FP-NEXT: vpor %xmm0, %xmm2, %xmm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm9 +; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: 
vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm11 -; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX2-FP-NEXT: vpor %xmm1, %xmm7, %xmm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm15, %xmm9 +; AVX2-FP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-FP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm13, %ymm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm1 -; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm10, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm8, %xmm12, %xmm2 +; AVX2-FP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm11, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: 
vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] @@ -7068,229 +7061,229 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FP-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride6_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm7, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm10 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = 
[128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm11 +; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[0,1],ymm9[0,1] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm10[2,3],ymm9[2,3] +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm15 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm5 +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; 
AVX2-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 -; AVX2-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm7 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm0 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm6 +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm1 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm14 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm6, %ymm12 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm9, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm5 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 -; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX2-FCP-NEXT: vpor %xmm2, %xmm9, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 +; AVX2-FCP-NEXT: 
vpblendvb %ymm10, %ymm14, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm2 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm9 +; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm11 -; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, 
%xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm9 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm13 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm1 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm2 +; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm2 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] @@ -7341,266 +7334,270 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: 
vmovaps %ymm0, 32(%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FCP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FCP-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride6_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $40, %rsp -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm29 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm9 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm18 -; AVX512-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] -; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) -; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512-NEXT: vporq %xmm1, %xmm5, %xmm17 -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm29 -; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-NEXT: vpshufb 
%xmm4, %xmm6, %xmm5 +; AVX512-NEXT: vporq %xmm3, %xmm5, %xmm27 +; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm30 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm22 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] +; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] +; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX512-NEXT: vporq %xmm3, %xmm8, %xmm23 +; AVX512-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm11 ^ ymm30)) +; AVX512-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm28 +; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm24 ^ ymm28)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512-NEXT: vporq %xmm5, %xmm10, %xmm20 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] -; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] -; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vporq %xmm4, %xmm6, %xmm28 -; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm6 -; AVX512-NEXT: vporq %xmm4, %xmm6, %xmm21 -; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] +; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512-NEXT: vporq %xmm6, %xmm7, %xmm21 +; AVX512-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-NEXT: vporq %xmm5, %xmm3, %xmm17 +; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm26 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm29 ^ ymm31)) +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm5 ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm25 ^ ymm22)) +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] +; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm13 ; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512-NEXT: vporq %xmm0, %xmm6, %xmm16 -; AVX512-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) -; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX512-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm30 ^ ymm11)) +; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512-NEXT: vporq %xmm0, %xmm4, %xmm16 +; AVX512-NEXT: vmovdqa %ymm14, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm11 & (ymm28 ^ ymm24)) +; AVX512-NEXT: vpshufb %xmm6, %xmm11, %xmm4 +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512-NEXT: vpor %xmm4, %xmm10, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15,u,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512-NEXT: vpor %xmm3, %xmm9, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] -; AVX512-NEXT: vpor %xmm1, %xmm15, %xmm0 +; 
AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm3 ; AVX512-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm20 ^ ymm19)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm12 = ymm23[2,3],mem[2,3] ; AVX512-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) -; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 +; AVX512-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm23 ^ ymm12)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0 | (ymm7 & mem) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm4 & (zmm0 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm0)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm17 & ymm7) +; AVX512-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) -; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] -; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm28 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) +; AVX512-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm14 & (ymm11 ^ ymm31)) ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-NEXT: 
vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm29 ; AVX512-NEXT: vporq %xmm1, %xmm2, %xmm26 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX512-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) -; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm8 & (ymm15 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX512-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512-NEXT: vporq %xmm1, %xmm3, %xmm31 +; AVX512-NEXT: vpternlogq $202, (%rsp), %ymm30, %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem ^ (ymm14 & (ymm30 ^ mem)) +; AVX512-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm12 ^ ymm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm24 ^ ymm28)) +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX512-NEXT: vpshufb %xmm5, %xmm15, %xmm10 ; AVX512-NEXT: vpor %xmm1, %xmm10, %xmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm11 +; 
AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm15 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) -; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm20 ^ (ymm6 & (ymm19 ^ ymm20)) +; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm1 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 -; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 -; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm8 -; AVX512-NEXT: 
vpor %xmm1, %xmm8, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 -; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) -; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX512-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm23 ^ (ymm8 & (ymm12 ^ ymm23)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm9[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm18) +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm20 ^ (ymm8 & (ymm19 ^ ymm20)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm3 & ~ymm11) +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm3 +; AVX512-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm14 +; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm14, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm14 ^ (zmm11 & (zmm9 ^ zmm14)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm14 & (zmm3 ^ zmm9)) +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ~ymm11) +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm14 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm1 & (zmm16 ^ zmm0)) +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm1 & (zmm21 ^ zmm0)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm16)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm21)) +; AVX512-NEXT: vmovdqa64 %zmm27, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: addq $40, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -7608,253 +7605,257 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride6_vf64: ; AVX512-FCP: # %bb.0: ; 
AVX512-FCP-NEXT: subq $40, %rsp -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm29 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512-FCP-NEXT: vpor %xmm3, %xmm6, %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512-FCP-NEXT: vporq %xmm1, %xmm5, %xmm17 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512-FCP-NEXT: vporq %xmm3, %xmm5, %xmm27 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm22 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = 
[u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX512-FCP-NEXT: vporq %xmm3, %xmm8, %xmm23 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm11 ^ ymm30)) +; AVX512-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm24 ^ ymm28)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512-FCP-NEXT: vporq %xmm5, %xmm10, %xmm20 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vporq %xmm4, %xmm6, %xmm28 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6 -; AVX512-FCP-NEXT: vporq %xmm4, %xmm6, %xmm21 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vporq %xmm6, %xmm7, %xmm21 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vporq %xmm5, %xmm3, %xmm17 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm26 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 
^ (ymm9 & (ymm29 ^ ymm31)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm5 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm25 ^ ymm22)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm13 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm30 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512-FCP-NEXT: vporq %xmm0, %xmm4, %xmm16 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm11 & (ymm28 ^ ymm24)) +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm4 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpor %xmm3, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] -; AVX512-FCP-NEXT: 
vpor %xmm1, %xmm15, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm20 ^ ymm19)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm12 = ymm23[2,3],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 +; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm23 ^ ymm12)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm0 = ymm0 | (ymm7 & mem) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm4 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm0)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm17 & ymm7) +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] -; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512-FCP-NEXT: vpor 
%xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm14 & (ymm11 ^ ymm31)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm29 ; AVX512-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX512-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm8 & (ymm15 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512-FCP-NEXT: vporq %xmm1, %xmm3, %xmm31 +; AVX512-FCP-NEXT: vpternlogq $202, (%rsp), %ymm30, %ymm14 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm14 = mem ^ (ymm14 & (ymm30 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm12 ^ ymm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm24 ^ ymm28)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] +; 
AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm10 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm15 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm20 ^ (ymm6 & (ymm19 ^ ymm20)) +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 -; AVX512-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm23 ^ 
(ymm8 & (ymm12 ^ ymm23)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm9[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm18) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm20 ^ (ymm8 & (ymm19 ^ ymm20)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm3 & ~ymm11) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm14 ^ (zmm11 & (zmm9 ^ zmm14)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm14 & (zmm3 ^ zmm9)) +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ~ymm11) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm14 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm1 & (zmm16 ^ zmm0)) +; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm1 & (zmm21 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm16)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm21)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-FCP-NEXT: addq $40, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -7862,253 +7863,257 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i8_stride6_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $40, %rsp -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm29 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512DQ-NEXT: vpor %xmm3, %xmm6, %xmm9 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm18 -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512DQ-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-NEXT: vporq %xmm1, %xmm5, %xmm17 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm29 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 -; 
AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512DQ-NEXT: vporq %xmm3, %xmm5, %xmm27 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm30 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm22 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX512DQ-NEXT: vporq %xmm3, %xmm8, %xmm23 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm11 ^ ymm30)) +; AVX512DQ-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512DQ-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm28 +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm24 ^ ymm28)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512DQ-NEXT: vporq %xmm5, %xmm10, %xmm20 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vporq %xmm4, %xmm6, %xmm28 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm6 -; AVX512DQ-NEXT: vporq %xmm4, %xmm6, %xmm21 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-NEXT: vporq %xmm6, %xmm7, %xmm21 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-NEXT: vporq %xmm5, %xmm3, %xmm17 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, 
%xmm1 +; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm26 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm29 ^ ymm31)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm5 ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm25 ^ ymm22)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm13 ; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512DQ-NEXT: vporq %xmm0, %xmm6, %xmm16 -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512DQ-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm30 ^ ymm11)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512DQ-NEXT: vporq %xmm0, %xmm4, %xmm16 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm11 & (ymm28 ^ ymm24)) +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm11, %xmm4 +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512DQ-NEXT: vpor %xmm4, %xmm10, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = 
[3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQ-NEXT: vpor %xmm3, %xmm9, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] -; AVX512DQ-NEXT: vpor %xmm1, %xmm15, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm3 ; AVX512DQ-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm20 ^ ymm19)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} ymm12 = ymm23[2,3],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded 
Reload -; AVX512DQ-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm23 ^ ymm12)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = ymm0 | (ymm7 & mem) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm4 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm0)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm17 & ymm7) +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] -; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm28 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm14 & (ymm11 ^ ymm31)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm29 ; AVX512DQ-NEXT: vporq %xmm1, %xmm2, %xmm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX512DQ-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm7 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm8 & (ymm15 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512DQ-NEXT: vporq %xmm1, %xmm3, %xmm31 +; AVX512DQ-NEXT: vpternlogq $202, (%rsp), %ymm30, %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem ^ (ymm14 & (ymm30 ^ mem)) +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm12 ^ 
ymm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm24 ^ ymm28)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm15, %xmm10 ; AVX512DQ-NEXT: vpor %xmm1, %xmm10, %xmm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm15 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm20 ^ (ymm6 & (ymm19 ^ ymm20)) +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512DQ-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 
^ (ymm2 & (ymm19 ^ ymm20)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm8 -; AVX512DQ-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) -; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-NEXT: 
vpshufb %xmm11, %xmm9, %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm23 ^ (ymm8 & (ymm12 ^ ymm23)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm18) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm20 ^ (ymm8 & (ymm19 ^ ymm20)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm3 & ~ymm11) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm3 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm14 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm14 ^ (zmm11 & (zmm9 ^ zmm14)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm14 & (zmm3 ^ zmm9)) +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ~ymm11) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm14 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm1 & (zmm16 ^ zmm0)) +; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded 
Reload +; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm1 & (zmm21 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm16)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm21)) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-NEXT: addq $40, %rsp ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -8116,253 +8121,257 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $40, %rsp -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm6, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-FCP-NEXT: 
vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm5, %xmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm5, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm25 ^ (ymm7 & (ymm22 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm8, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm11 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm24 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512DQ-FCP-NEXT: vporq %xmm5, %xmm10, %xmm20 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm6, %xmm28 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6 -; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm6, %xmm21 -; 
AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vporq %xmm6, %xmm7, %xmm21 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vporq %xmm5, %xmm3, %xmm17 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm29 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm5 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm25 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm13 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm30 ^ ymm11)) +; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm4, %xmm16 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm11 & (ymm28 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm4 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm10, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[3,9,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm15, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm20 ^ ymm19)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] 
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} ymm12 = ymm23[2,3],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm23 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm0 = ymm0 | (ymm7 & mem) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm4 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 
%xmm26, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm17 & ymm7) +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] -; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm14 & (ymm11 ^ ymm31)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm29 ; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] -; 
AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm25 ^ (ymm8 & (ymm15 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm3, %xmm31 +; AVX512DQ-FCP-NEXT: vpternlogq $202, (%rsp), %ymm30, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm14 = mem ^ (ymm14 & (ymm30 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm12 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm24 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm10 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ 
(ymm5 & (ymm19 ^ ymm20)) -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm20 ^ (ymm6 & (ymm19 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: 
vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm23 ^ (ymm8 & (ymm12 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm9[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm18) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm20 ^ (ymm8 & (ymm19 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm3 & ~ymm11) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm14 ^ (zmm11 & (zmm9 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm14 & (zmm3 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ~ymm11) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm14 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm1 & (zmm16 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm1 & (zmm21 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FCP-NEXT: addq $40, %rsp ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8380,41 +8389,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm5 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; 
AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm26 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13 -; AVX512BW-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] -; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 +; AVX512BW-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm14 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm11[2,3],mem[2,3] +; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm13 ; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224 ; AVX512BW-NEXT: kmovd %r10d, %k4 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} ; AVX512BW-NEXT: vpshufb %xmm2, %xmm20, %xmm2 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] -; AVX512BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 +; AVX512BW-NEXT: vpshufb %ymm11, %ymm19, %ymm2 {%k2} +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],mem[2,3] +; AVX512BW-NEXT: vinserti128 $1, 288(%rdi), %ymm14, %ymm14 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} -; AVX512BW-NEXT: vpshufb %ymm6, %ymm22, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm11, %ymm22, %ymm7 ; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX512BW-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} @@ -8444,7 +8453,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} ; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 @@ -8457,24 +8466,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} ; AVX512BW-NEXT: vpblendmw %ymm13, 
%ymm5, %ymm15 {%k4} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 +; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm7 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm7, %xmm16 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm16, %xmm12 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm7, %xmm18 ; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k5 ; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} ; AVX512BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm21, %xmm8 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 -; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 @@ -8483,20 +8492,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512BW-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm12, %zmm12 ; AVX512BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF ; AVX512BW-NEXT: kmovq %rdi, %k6 +; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm8 {%k4} +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm27 {%k1} ; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} -; AVX512BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22 -; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm27 -; AVX512BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25 -; AVX512BW-NEXT: vporq %xmm22, %xmm25, %xmm22 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} +; AVX512BW-NEXT: vpshufb %xmm22, %xmm27, %xmm18 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm27, %xmm22 +; AVX512BW-NEXT: vpshufb %xmm25, %xmm22, %xmm25 +; AVX512BW-NEXT: vporq %xmm18, %xmm25, %xmm18 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-NEXT: vpshufb %ymm19, %ymm8, %ymm18 {%k5} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k3} ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512BW-NEXT: kmovd %edi, %k4 ; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} @@ -8504,29 +8513,29 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = 
[128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 -; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} +; AVX512BW-NEXT: vpshufb %xmm16, %xmm7, %xmm7 +; AVX512BW-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm7 {%k5} ; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 ; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 -; AVX512BW-NEXT: vporq %xmm16, %xmm19, %xmm16 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm18 +; AVX512BW-NEXT: vporq %xmm16, %xmm18, %xmm16 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} -; AVX512BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8 -; AVX512BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15 -; AVX512BW-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k6} +; AVX512BW-NEXT: vpshufb %xmm15, %xmm27, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm22, %xmm15 +; AVX512BW-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm14, %ymm8, %ymm7 {%k5} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} @@ -8566,7 +8575,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 @@ -8616,41 +8625,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX512BW-FCP-NEXT: vpor %xmm1, 
%xmm3, %xmm5 +; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm15, %xmm13 -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm11[2,3],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm13 ; AVX512BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k4 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm2 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm21, %xmm4 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm19, %ymm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm14, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} @@ -8680,7 +8689,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm20, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 @@ -8693,24 +8702,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} ; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm8, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm16, %xmm12 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm7, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm16, %xmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm7, %xmm18 ; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-FCP-NEXT: kmovd %edi, %k5 ; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} ; AVX512BW-FCP-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm21, %xmm8 ; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm12 -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm17, %xmm12 @@ -8719,20 +8728,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm12, %zmm12 ; AVX512BW-FCP-NEXT: movl $2097151, %edi # imm = 0x1FFFFF ; AVX512BW-FCP-NEXT: kmovq %rdi, %k6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm8 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm22 -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm27 -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm25 -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm25, %xmm22 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm27, %xmm18 +; AVX512BW-FCP-NEXT: 
vextracti32x4 $1, %ymm27, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm25 +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm25, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm8, %ymm18 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k3} ; AVX512BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512BW-FCP-NEXT: kmovd %edi, %k4 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} @@ -8740,29 +8749,29 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm16, %xmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} +; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm7 {%k5} ; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm20, %xmm15 ; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm17, %xmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm19 -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm19, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm18 +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm18, %xmm16 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} -; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm15 -; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k6} +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm27, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm22, %xmm15 +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm7 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} @@ -8802,7 +8811,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512BW-FCP-NEXT: 
vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm15 @@ -8852,41 +8861,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm3, %xmm5 +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm26 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11 +; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm16, %xmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13 -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm11[2,3],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm13 ; AVX512DQ-BW-NEXT: movw $-28124, %r10w # imm = 0x9224 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k4 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm20, %xmm2 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm19, %ymm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti128 $1, 288(%rdi), %ymm14, %ymm14 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} ; 
AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm22, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm22, %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} @@ -8916,7 +8925,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm9 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 @@ -8929,24 +8938,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} ; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 +; AVX512DQ-BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm7, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm16, %xmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 +; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm7, %xmm18 ; AVX512DQ-BW-NEXT: vporq %xmm12, %xmm18, %xmm18 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-NEXT: kmovd %edi, %k5 ; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} ; AVX512DQ-BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm21, %xmm8 ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 @@ -8955,20 +8964,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512DQ-BW-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm8, %zmm12, %zmm12 ; AVX512DQ-BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF ; AVX512DQ-BW-NEXT: kmovq %rdi, %k6 +; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm8 {%k4} +; 
AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm27 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} -; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22 -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm27 -; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25 -; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm25, %xmm22 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} +; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm27, %xmm18 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm27, %xmm22 +; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm22, %xmm25 +; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm25, %xmm18 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm8, %ymm18 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k3} ; AVX512DQ-BW-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} @@ -8976,29 +8985,29 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} +; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm15, %ymm7 {%k5} ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 ; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 -; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm19, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm24, %xmm18 +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm18, %xmm16 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} -; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15 -; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k6} +; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm27, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm22, %xmm15 +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm8, %ymm7 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 
$1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} @@ -9038,7 +9047,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 @@ -9088,41 +9097,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm15, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm11[2,3],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm13 ; AVX512DQ-BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k4 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm21, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,0,6,12,2,8,14,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm19, %ymm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm14, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} @@ -9152,7 +9161,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm20, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,1,7,13,3,9,15,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 @@ -9165,24 +9174,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm8, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm16, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm7, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm16, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm7, %xmm18 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,2,8,14,4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm23, 
%ymm0, %ymm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm21, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm17, %xmm12 @@ -9191,20 +9200,20 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm12, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movl $2097151, %edi # imm = 0x1FFFFF ; AVX512DQ-BW-FCP-NEXT: kmovq %rdi, %k6 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm25, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm27, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm25, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm8, %ymm18 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k3} ; AVX512DQ-BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} @@ -9212,29 +9221,29 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm16, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,3,9,15,5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm7 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm20, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm17, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm19, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm18, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm15, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm27, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm22, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm7 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,4,10,0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} @@ -9274,7 +9283,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,5,11,1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm15 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 86c932a5bb1f9..ab1c20826cf01 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -69,8 +69,6 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride7_vf2: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -84,14 +82,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX-NEXT: vpextrw $0, %xmm5, 
(%r9) -; AVX-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride7_vf2: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -105,14 +103,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride7_vf2: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -126,14 +124,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride7_vf2: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -147,14 +145,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride7_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -168,14 +166,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride7_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -189,14 +187,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride7_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -210,14 +208,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -231,14 +229,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride7_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -252,14 +250,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512BW-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -273,14 +271,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride7_vf2: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -294,14 +292,14 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -315,7 +313,9 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <14 x i8>, ptr %in.vec, align 64 @@ -447,11 +447,11 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i8_stride7_vf4: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] @@ -479,32 +479,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovd %xmm5, (%rcx) ; AVX-NEXT: vmovd %xmm7, (%r8) ; AVX-NEXT: vmovd %xmm4, (%r9) -; AVX-NEXT: vmovd %xmm6, (%r10) +; AVX-NEXT: vmovd %xmm6, (%rdi) ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride7_vf4: ; AVX2: # %bb.0: ; AVX2-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -518,32 +518,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovd %xmm5, (%rcx) ; AVX2-NEXT: vmovd %xmm7, (%r8) ; AVX2-NEXT: vmovd %xmm4, (%r9) -; AVX2-NEXT: vmovd %xmm6, (%r10) +; AVX2-NEXT: vmovd %xmm6, (%rdi) ; AVX2-NEXT: vmovd %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride7_vf4: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, 
%xmm4 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -557,32 +557,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovd %xmm5, (%rcx) ; AVX2-FP-NEXT: vmovd %xmm7, (%r8) ; AVX2-FP-NEXT: vmovd %xmm4, (%r9) -; AVX2-FP-NEXT: vmovd %xmm6, (%r10) +; AVX2-FP-NEXT: vmovd %xmm6, (%rdi) ; AVX2-FP-NEXT: vmovd %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride7_vf4: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -596,18 +596,18 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovd %xmm5, (%rcx) ; AVX2-FCP-NEXT: vmovd %xmm7, (%r8) ; AVX2-FCP-NEXT: vmovd %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovd %xmm6, (%r10) +; AVX2-FCP-NEXT: vmovd %xmm6, (%rdi) ; AVX2-FCP-NEXT: vmovd %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride7_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] @@ -635,18 +635,18 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovd %xmm5, (%rcx) ; AVX512-NEXT: vmovd %xmm7, (%r8) ; AVX512-NEXT: vmovd %xmm4, (%r9) -; AVX512-NEXT: vmovd %xmm6, (%r10) +; AVX512-NEXT: vmovd %xmm6, (%rdi) ; AVX512-NEXT: vmovd %xmm0, (%rax) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride7_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] @@ -674,18 +674,18 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovd %xmm5, (%rcx) ; AVX512-FCP-NEXT: vmovd %xmm7, (%r8) ; AVX512-FCP-NEXT: vmovd %xmm4, (%r9) -; AVX512-FCP-NEXT: vmovd %xmm6, (%r10) +; AVX512-FCP-NEXT: vmovd %xmm6, (%rdi) ; AVX512-FCP-NEXT: vmovd %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride7_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] @@ -713,18 +713,18 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovd %xmm5, (%rcx) ; AVX512DQ-NEXT: vmovd %xmm7, (%r8) ; AVX512DQ-NEXT: vmovd %xmm4, (%r9) -; AVX512DQ-NEXT: vmovd %xmm6, (%r10) +; AVX512DQ-NEXT: vmovd %xmm6, (%rdi) ; AVX512DQ-NEXT: vmovd %xmm0, (%rax) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] @@ -752,32 +752,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%rcx) ; AVX512DQ-FCP-NEXT: vmovd %xmm7, (%r8) ; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r10) +; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%rdi) ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax) ; 
AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride7_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -791,32 +791,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovd %xmm5, (%rcx) ; AVX512BW-NEXT: vmovd %xmm7, (%r8) ; AVX512BW-NEXT: vmovd %xmm4, (%r9) -; AVX512BW-NEXT: vmovd %xmm6, (%r10) +; AVX512BW-NEXT: vmovd %xmm6, (%rdi) ; AVX512BW-NEXT: vmovd %xmm0, (%rax) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; 
AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -830,32 +830,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) ; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r8) ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%r9) -; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r10) +; AVX512BW-FCP-NEXT: vmovd %xmm6, (%rdi) ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride7_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -869,32 +869,32 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovd %xmm5, (%rcx) ; AVX512DQ-BW-NEXT: vmovd %xmm7, (%r8) ; AVX512DQ-BW-NEXT: vmovd %xmm4, (%r9) -; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r10) +; AVX512DQ-BW-NEXT: vmovd %xmm6, (%rdi) ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf4: ; AVX512DQ-BW-FCP: # 
%bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -908,7 +908,7 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%rdi) ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <28 x i8>, ptr %in.vec, align 64 @@ -1159,239 +1159,237 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride7_vf8: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] -; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] -; AVX-NEXT: # xmm7 = mem[0,0] -; AVX-NEXT: vpblendvb 
%xmm7, %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] +; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] +; AVX-NEXT: # xmm8 = mem[0,0] +; AVX-NEXT: vpblendvb %xmm8, %xmm3, %xmm5, %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3] +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8 +; AVX-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX-NEXT: vpblendvb %xmm8, %xmm6, %xmm9, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5] +; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX-NEXT: vpblendvb %xmm8, %xmm9, %xmm7, %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm11 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vmovq %xmm3, (%rsi) ; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm8, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) -; AVX-NEXT: vmovq %xmm10, (%r10) +; AVX-NEXT: vmovq %xmm6, (%rcx) +; AVX-NEXT: vmovq %xmm7, (%r8) +; AVX-NEXT: vmovq %xmm8, (%r9) +; AVX-NEXT: vmovq %xmm10, (%rdi) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride7_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) -; AVX2-NEXT: vmovq %xmm6, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vmovq %xmm7, (%r8) +; AVX2-NEXT: vmovq %xmm5, (%r9) +; AVX2-NEXT: vmovq %xmm6, (%rdi) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride7_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vmovq %xmm7, (%r8) +; AVX2-FP-NEXT: vmovq %xmm5, (%r9) +; AVX2-FP-NEXT: vmovq %xmm6, (%rdi) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride7_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 
$1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} 
xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vmovq %xmm7, (%r8) +; AVX2-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX2-FCP-NEXT: vmovq %xmm6, (%rdi) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride7_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] @@ -1410,6 +1408,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1421,6 +1420,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 @@ -1437,15 +1437,13 @@ 
define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm5, (%r8) ; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) +; AVX512-NEXT: vmovq %xmm7, (%rdi) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride7_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] @@ -1464,6 +1462,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1475,6 +1474,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 @@ -1491,15 +1491,13 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512-FCP-NEXT: vmovq %xmm7, (%rdi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride7_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] @@ -1518,6 +1516,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1529,6 +1528,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 @@ -1545,15 +1545,13 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-NEXT: vmovq %xmm7, (%rdi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] @@ -1572,6 +1570,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -1583,6 +1582,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 @@ -1599,55 +1599,55 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride7_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-NEXT: movw $290, %di # imm = 0x122 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: movw $290, %ax # imm = 0x122 +; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: 
vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: movw $580, %di # imm = 0x244 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k1} +; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,xmm6[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k1} +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,xmm8[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: movw $9288, %r10w # imm = 0x2448 +; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1656,57 +1656,57 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-NEXT: vmovq %xmm7, (%rdi) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FCP-NEXT: movw $290, %di # imm = 0x122 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: movw $290, %ax # imm = 0x122 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,xmm6[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; 
AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,xmm8[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: movw $9288, %r10w # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1715,57 +1715,57 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm7, (%rdi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride7_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: movw $290, %di # imm = 0x122 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: movw $290, %ax # imm = 0x122 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; 
AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,xmm6[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,xmm8[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: movw $9288, %r10w # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1774,57 +1774,57 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq 
%xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm7, (%rdi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: movw $290, %di # imm = 0x122 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: movw $290, %ax # imm = 0x122 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,xmm5[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,xmm6[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, 
%xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,xmm8[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: movw $9288, %r10w # imm = 0x2448 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1833,9 +1833,9 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%rdi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -2382,421 +2382,423 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i8_stride7_vf16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm11, %xmm7, %xmm8, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX-NEXT: vpxor %xmm12, %xmm12, %xmm12 -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm0[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] ; AVX-NEXT: vpor %xmm10, %xmm9, %xmm10 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm9 = [18446744073709551615,255] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX-NEXT: vpblendvb %xmm11, %xmm10, %xmm13, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = 
xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11] +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm0[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX-NEXT: vpblendvb %xmm11, %xmm13, %xmm14, %xmm11 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,xmm12[0,7,14,u,u] ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm0[7] +; AVX-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 ; AVX-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm15, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,xmm12[1,8,15,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u] -; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13] -; AVX-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm8[7] +; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX-NEXT: vpblendvb %xmm9, %xmm1, %xmm0, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,xmm12[2,9,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX-NEXT: # xmm15 = mem[0,0] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14] -; AVX-NEXT: vpor %xmm8, %xmm14, %xmm8 -; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0 -; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX-NEXT: vpblendvb %xmm9, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[3,10,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15] -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] -; AVX-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7] -; AVX-NEXT: vmovdqa %xmm1, (%rsi) -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vmovaps %xmm1, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,xmm12[3,10,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm9, %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1,2,3,4],xmm2[5,6,7] +; AVX-NEXT: vmovdqa %xmm2, (%rsi) +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm2, (%rdx) ; AVX-NEXT: vmovdqa %xmm10, (%rcx) ; AVX-NEXT: vmovdqa %xmm11, (%r8) -; AVX-NEXT: vmovdqa %xmm12, (%r9) +; AVX-NEXT: vmovdqa %xmm8, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa %xmm0, (%rax) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovdqa %xmm2, (%rax) +; AVX-NEXT: vmovdqa %xmm1, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride7_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0],xmm11[1],xmm10[2],xmm11[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 -; 
AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm6 +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero -; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX2-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero -; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero -; AVX2-NEXT: vpor %xmm12, %xmm9, %xmm9 -; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = 
zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero -; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] -; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero +; AVX2-NEXT: vpor %xmm6, %xmm13, %xmm13 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %xmm6, %xmm3, %xmm13, %xmm3 +; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm13 +; AVX2-NEXT: vpor %xmm12, %xmm9, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11] +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero +; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-NEXT: vpblendvb %xmm6, %xmm9, %xmm12, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm12 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm11[5,12] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero +; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-NEXT: vpblendvb %xmm6, %xmm12, %xmm10, %xmm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero +; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX2-NEXT: vpblendvb %xmm6, %xmm12, %xmm11, %xmm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[5,12],zero,zero,xmm12[1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm13[0,7,14] +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-NEXT: vpblendvb %xmm6, %xmm12, %xmm13, %xmm12 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero -; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovdqa %xmm3, (%rsi) -; AVX2-NEXT: vmovdqa %xmm6, (%rdx) -; AVX2-NEXT: vmovdqa %xmm8, (%rcx) -; AVX2-NEXT: vmovdqa %xmm9, (%r8) -; AVX2-NEXT: vmovdqa %xmm10, (%r9) -; AVX2-NEXT: vmovdqa %xmm11, (%r10) +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm1 +; AVX2-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-NEXT: vmovdqa %xmm9, (%rcx) +; AVX2-NEXT: vmovdqa %xmm10, (%r8) +; AVX2-NEXT: vmovdqa %xmm11, (%r9) +; AVX2-NEXT: vmovdqa %xmm12, (%rdi) ; AVX2-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; 
; AVX2-FP-LABEL: load_i8_stride7_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0],xmm11[1],xmm10[2],xmm11[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero -; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero -; AVX2-FP-NEXT: vpor %xmm12, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero -; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero +; AVX2-FP-NEXT: vpor %xmm6, %xmm13, %xmm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm3, %xmm13, %xmm3 +; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm13 +; AVX2-FP-NEXT: vpor %xmm12, %xmm9, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero +; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm9, %xmm12, %xmm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm11[5,12] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero ; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm12, %xmm10, %xmm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero +; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm12, %xmm11, %xmm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[5,12],zero,zero,xmm12[1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm13[0,7,14] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm14, 
%xmm13, %xmm13 +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm12, %xmm13, %xmm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) -; AVX2-FP-NEXT: vmovdqa %xmm9, (%r8) -; AVX2-FP-NEXT: vmovdqa %xmm10, (%r9) -; AVX2-FP-NEXT: vmovdqa %xmm11, (%r10) +; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm2, %xmm4, %xmm1 +; AVX2-FP-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FP-NEXT: vmovdqa %xmm9, (%rcx) +; AVX2-FP-NEXT: vmovdqa %xmm10, (%r8) +; AVX2-FP-NEXT: vmovdqa %xmm11, (%r9) +; AVX2-FP-NEXT: vmovdqa %xmm12, (%rdi) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride7_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm3 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0],xmm11[1],xmm10[2],xmm11[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero -; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero -; AVX2-FCP-NEXT: vpor %xmm12, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero -; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero +; AVX2-FCP-NEXT: vpor %xmm6, %xmm13, %xmm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm3, %xmm13, %xmm3 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpor %xmm12, %xmm9, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero +; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm9, %xmm12, %xmm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm11[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm11[5,12] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero +; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm12, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero +; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX2-FCP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm12, %xmm11, %xmm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[5,12],zero,zero,xmm12[1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm13[0,7,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm12, %xmm13, %xmm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm9, (%r8) -; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r9) -; 
AVX2-FCP-NEXT: vmovdqa %xmm11, (%r10) +; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpor %xmm2, %xmm4, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r8) +; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r9) +; AVX2-FCP-NEXT: vmovdqa %xmm12, (%rdi) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -2805,96 +2807,96 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm3 ^ (ymm5 & (ymm1 ^ ymm3)) ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] ; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) +; AVX512-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm2[1],xmm4[2,3,4],xmm2[5],xmm4[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512-NEXT: vpor %xmm6, %xmm9, %xmm11 +; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm1 ^ ymm3)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm11)) ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] +; AVX512-NEXT: vpor %xmm12, %xmm10, %xmm11 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) +; AVX512-NEXT: vpor %xmm12, %xmm10, %xmm12 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm1 ^ ymm3)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm11)) +; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ 
xmm12)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] +; AVX512-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm8)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6],xmm2[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm3 ^ ymm1)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-NEXT: vpor %xmm15, %xmm9, %xmm3 +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ (xmm7 & (xmm3 ^ xmm8)) +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6],xmm2[7] ; 
AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) ; AVX512-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512-NEXT: vmovdqa %xmm11, (%rcx) +; AVX512-NEXT: vmovdqa %xmm12, (%rcx) ; AVX512-NEXT: vmovdqa %xmm14, (%r8) -; AVX512-NEXT: vmovdqa %xmm12, (%r9) -; AVX512-NEXT: vmovdqa %xmm9, (%r10) +; AVX512-NEXT: vmovdqa %xmm11, (%r9) +; AVX512-NEXT: vmovdqa %xmm3, (%r10) ; AVX512-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -2903,96 +2905,96 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm3 ^ (ymm5 & (ymm1 ^ ymm3)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = 
[18446744073709551615,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) +; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm2[1],xmm4[2,3,4],xmm2[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm9, %xmm11 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm1 ^ ymm3)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm11)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] +; AVX512-FCP-NEXT: vpor %xmm12, %xmm10, %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) +; AVX512-FCP-NEXT: vpor %xmm12, %xmm10, %xmm12 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm1 ^ ymm3)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm11)) +; AVX512-FCP-NEXT: vextracti128 $1, 
%ymm13, %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2],xmm2[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] +; AVX512-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm8)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6],xmm2[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm3 ^ ymm1)) ; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpor %xmm15, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ (xmm7 & (xmm3 ^ xmm8)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6],xmm2[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm11, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %xmm12, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm14, (%r8) -; AVX512-FCP-NEXT: vmovdqa %xmm12, (%r9) -; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r10) +; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r9) +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r10) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -3001,96 +3003,96 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm3 ^ (ymm5 & (ymm1 ^ ymm3)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) +; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm2[1],xmm4[2,3,4],xmm2[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512DQ-NEXT: vpor %xmm6, %xmm9, %xmm11 +; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm11)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] +; AVX512DQ-NEXT: vpor %xmm12, %xmm10, %xmm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512DQ-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) +; AVX512DQ-NEXT: vpor %xmm12, %xmm10, %xmm12 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm11)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm3 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] +; AVX512DQ-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512DQ-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm3 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm8)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} 
xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm3 ^ ymm1)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-NEXT: vpor %xmm15, %xmm9, %xmm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ (xmm7 & (xmm3 ^ xmm8)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512DQ-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-NEXT: vmovdqa %xmm11, (%rcx) +; AVX512DQ-NEXT: vmovdqa %xmm12, (%rcx) ; AVX512DQ-NEXT: vmovdqa %xmm14, (%r8) -; AVX512DQ-NEXT: vmovdqa %xmm12, (%r9) -; AVX512DQ-NEXT: vmovdqa %xmm9, (%r10) +; AVX512DQ-NEXT: vmovdqa %xmm11, (%r9) +; AVX512DQ-NEXT: vmovdqa %xmm3, (%r10) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -3099,96 +3101,96 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm3 ^ (ymm5 & (ymm1 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), 
%xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm2[1],xmm4[2,3,4],xmm2[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm9, %xmm11 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm1 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm11)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] +; AVX512DQ-FCP-NEXT: vpor %xmm12, 
%xmm10, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) +; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm1 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm11)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor 
%xmm8, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm3 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ (xmm7 & (xmm3 ^ xmm8)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r10) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -3196,15 +3198,14 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i8_stride7_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = 
[128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512BW-NEXT: kmovd %r11d, %k1 +; AVX512BW-NEXT: movw $-28382, %r10w # imm = 0x9122 +; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] @@ -3215,80 +3216,81 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm10 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12] +; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} +; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512BW-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm11 {%k3} +; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm12, %xmm11, 
%xmm11 -; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm8 {%k1} +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15] ; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm9, (%r8) ; AVX512BW-NEXT: vmovdqa %xmm10, (%r9) -; AVX512BW-NEXT: vmovdqa %xmm9, (%r10) +; AVX512BW-NEXT: vmovdqa %xmm8, (%rdi) ; AVX512BW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3296,15 +3298,14 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i8_stride7_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 +; AVX512BW-FCP-NEXT: movw $-28382, %r10w # imm = 0x9122 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] @@ -3315,80 +3316,81 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-FCP-NEXT: kmovd %edi, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm10 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm11 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; 
AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rdi) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -3396,15 +3398,14 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-LABEL: load_i8_stride7_vf16: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 +; AVX512DQ-BW-NEXT: movw $-28382, %r10w # imm = 0x9122 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] @@ -3415,80 +3416,81 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512DQ-BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, 
%ymm7 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm10 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12] +; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm11 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm11, %xmm8 {%k1} +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r8) +; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r8) ; AVX512DQ-BW-NEXT: vmovdqa %xmm10, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r10) +; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rdi) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -3496,15 +3498,14 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 +; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r10w # imm = 0x9122 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; 
AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] @@ -3515,80 +3516,81 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movw 
$9288, %di # imm = 0x2448 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm10 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm11 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rdi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -4689,8 +4691,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i8_stride7_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $200, %rsp -; AVX-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u] +; AVX-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13,u,u,u,u] ; AVX-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 @@ -4700,138 +4702,139 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] -; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rdi), %xmm10 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm14 +; AVX-NEXT: vmovdqa (%rdi), %xmm11 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm2 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm12[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm2 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm12[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,6,13],zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm2 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm12 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] -; 
AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[4,11,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm4 +; AVX-NEXT: vmovdqa 192(%rdi), %xmm7 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX-NEXT: vpor %xmm12, %xmm13, %xmm13 -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709486080,16777215] -; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX-NEXT: vpor %xmm3, %xmm13, %xmm13 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm13, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX-NEXT: vpor %xmm2, %xmm14, %xmm2 -; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX-NEXT: vpor %xmm2, %xmm13, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm12, %xmm2, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm15 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX-NEXT: vpor %xmm5, %xmm6, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0 +; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm4, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm2 +; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2,3,4,5],xmm2[6,7] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm2 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; AVX-NEXT: vmovdqa 96(%rdi), %xmm8 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm12, %ymm14, %ymm12 ; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -4840,21 +4843,21 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpblendw $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] -; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6],xmm3[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: vandps %ymm14, %ymm13, %ymm13 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -4863,135 +4866,139 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm1 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] -; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5 -; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX-NEXT: vorps %ymm5, %ymm4, %ymm0 +; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; 
AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm3[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm15, %xmm12 +; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm5, %ymm5 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4 +; AVX-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm11[u,u] +; AVX-NEXT: vorps %ymm4, %ymm5, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] -; AVX-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] +; AVX-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] ; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm10 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm10 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 ; AVX-NEXT: vorps %ymm7, %ymm10, %ymm7 -; AVX-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX-NEXT: vandps %ymm7, %ymm14, %ymm7 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX-NEXT: vandnps %ymm10, %ymm14, %ymm10 ; AVX-NEXT: vorps %ymm7, %ymm10, %ymm0 -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm7 -; AVX-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm7 +; AVX-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm11[u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u] ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] ; AVX-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm10 +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm10 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX-NEXT: vandps %ymm1, %ymm7, %ymm7 -; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX-NEXT: vandnps %ymm7, %ymm14, %ymm7 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm12 ; AVX-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10 -; AVX-NEXT: vmovdqa %xmm5, %xmm3 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm10 +; AVX-NEXT: vmovdqa %xmm0, %xmm3 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm1, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm4, %xmm1 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm10[2,3,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[u,u,u] ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX-NEXT: # xmm12 = mem[0,0] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] ; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] -; AVX-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX-NEXT: 
vpshufb %xmm7, %xmm2, %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm5 +; AVX-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4 +; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm4 ; AVX-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7] -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm11[u,u,u] ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15] ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX-NEXT: vandnps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1 -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%rcx) -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%r8) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, (%r9) -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps %ymm4, (%rax) +; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vandps %ymm2, %ymm14, %ymm2 +; 
AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%r9) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm0, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vmovaps %ymm1, (%rax) ; AVX-NEXT: addq $200, %rsp ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -5001,59 +5008,62 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: subq $72, %rsp ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm2, %ymm6 +; AVX2-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 -; AVX2-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm6 +; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm13, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm8 +; AVX2-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -5065,144 +5075,143 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm8, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] ; AVX2-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX2-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-NEXT: vpor %xmm0, %xmm14, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm14 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm10, %ymm11, %ymm10 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] ; AVX2-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX2-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX2-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 -; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 -; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 -; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 -; AVX2-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 -; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 -; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 -; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] -; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm12 +; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm10 +; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm2 +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm13, %ymm5 +; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm13, %ymm9 +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm8 +; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm6 +; AVX2-NEXT: vpblendvb %ymm3, %ymm13, %ymm0, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7,8,9,10],ymm13[11],ymm5[12,13],ymm13[14],ymm5[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm13, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = 
zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm13, %xmm2 +; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6],ymm1[7,8],ymm8[9,10,11],ymm1[12],ymm8[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm1, %ymm8 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 
32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm12[1,2,3,4,5,6,7],ymm3[8],ymm12[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-NEXT: addq $72, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -5212,59 +5221,62 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: subq $72, %rsp ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm2, %ymm6 +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb 
%ymm1, %ymm0, %ymm6, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm3, %ymm13, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm8 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: 
vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -5276,144 +5288,143 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FP-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm8, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] ; AVX2-FP-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX2-FP-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-FP-NEXT: vpor %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm14 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] ; AVX2-FP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX2-FP-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX2-FP-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm12 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm13, %ymm5 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm13, %ymm9 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm13, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7,8,9,10],ymm13[11],ymm5[12,13],ymm13[14],ymm5[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm13, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm13, %xmm2 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6],ymm1[7,8],ymm8[9,10,11],ymm1[12],ymm8[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm8, %ymm1, %ymm8 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendw $254, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm12[1,2,3,4,5,6,7],ymm3[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FP-NEXT: addq $72, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -5421,193 +5432,194 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i8_stride7_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $40, %rsp -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: 
vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,1,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm10, %ymm2 +; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5],ymm7[6],ymm2[7,8,9],ymm7[10],ymm2[11,12,13],ymm7[14],ymm2[15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [0,18446744073709551360,16777215,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm8, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,1,3,4,6] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,12] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vmovdqa 
192(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm9, %ymm7, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm12, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-FCP-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm7, %ymm9, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14] ; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14] -; AVX2-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15] -; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm8, %ymm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX2-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: 
vpblendvb %ymm1, %ymm5, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm11, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm14 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX2-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8,9,10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6],ymm0[7,8],ymm1[9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] -; AVX2-FCP-NEXT: 
vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1,2,3],ymm1[4],ymm14[5,6],ymm1[7,8],ymm14[9,10,11],ymm1[12],ymm14[13,14],ymm1[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3,10],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,3,5,6] -; AVX2-FCP-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,3,5,6] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm8[0],mem[1,2,3,4,5,6,7],ymm8[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm15[1,2,3,4,5,6,7],ymm0[8],ymm15[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm12[1,2,3,4,5,6,7],ymm1[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi) @@ -5615,9 +5627,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FCP-NEXT: addq $40, %rsp @@ -5626,1130 +5638,1124 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i8_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ 
(ymm1 & (ymm3 ^ ymm2)) -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm2 ^ ymm18)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7)) -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm10, %xmm9, %xmm13 +; AVX512-NEXT: vmovdqa %ymm14, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm4 & (ymm6 ^ ymm7)) +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm13 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm4 ^ (ymm15 & (ymm9 ^ ymm4)) ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem) -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = 
[18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12)) -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] -; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm13 & mem) +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512-NEXT: vmovdqa %ymm11, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm2 ^ ymm18)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm19 = ymm12 ^ (ymm16 & (ymm19 ^ ymm12)) +; AVX512-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vmovdqa %ymm13, %ymm12 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7)) -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm15, %xmm12, %xmm15 +; AVX512-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm4 ^ ymm9)) ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8 -; AVX512-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9)) -; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3)) -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm8 & ~mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm16 & (ymm12 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm6 ^ ymm7)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vmovdqa %ymm13, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 ^ (ymm8 & (ymm4 ^ ymm9)) +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8,9,10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm1 & ymm17) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2)) +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512-NEXT: vpshufb %xmm15, %xmm5, %xmm15 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX512-NEXT: vmovdqa %ymm13, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2)) +; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12 -; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20 -; AVX512-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm6 ^ ymm7)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX512-NEXT: vmovdqa 
%ymm0, %ymm14 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm4 ^ ymm9)) ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm1 & ymm17) +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm13, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm7 ^ ymm6)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm14 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX512-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm4 ^ ymm9)) ; AVX512-NEXT: vpblendw 
{{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm1 & ymm17) +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX512-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512-NEXT: vpor %xmm8, %xmm15, %xmm8 +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm7 ^ ymm6)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX512-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm4 ^ (ymm15 & (ymm9 ^ ymm4)) ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2)) -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm1 & ymm17) +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm2 ^ ymm18)) +; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm16 & (ymm2 ^ ymm1)) +; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm9 ^ ymm4)) ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & ymm17) +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) -; AVX512-NEXT: vmovdqa64 %ymm19, (%rdx) +; AVX512-NEXT: vmovdqa64 %ymm19, (%rsi) +; AVX512-NEXT: vmovdqa %ymm12, (%rdx) ; AVX512-NEXT: vmovdqa64 %ymm20, (%rcx) ; AVX512-NEXT: vmovdqa64 %ymm21, (%r8) ; AVX512-NEXT: vmovdqa %ymm14, (%r9) -; AVX512-NEXT: vmovdqa %ymm8, (%r10) +; AVX512-NEXT: vmovdqa %ymm8, (%rdi) ; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm2 ^ ymm18)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,1,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm5 ^ (ymm3 & (ymm4 ^ ymm5)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; 
AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm7 ^ ymm3)) ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem) -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7,8,9],ymm8[10],ymm14[11,12],ymm8[13],ymm14[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm11 & mem) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm16 & (ymm1 ^ ymm6)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm18 ^ (ymm6 & (ymm2 ^ ymm18)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,1,3,4,6] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm10 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor 
%xmm14, %xmm10, %xmm14 +; AVX512-FCP-NEXT: vpor %xmm15, %xmm10, %xmm15 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm3 ^ ymm7)) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm15 & ~mem) +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm5 ^ (ymm15 & (ymm4 ^ ymm5)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm14[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 & (ymm3 ^ ymm7)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & 
(ymm18 ^ ymm2)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm1 & ymm16) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,3,5,6] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm18 ^ ymm2)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,12] +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm4 ^ ymm5)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm3 ^ ymm7)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5,6],ymm8[7,8],ymm13[9,10],ymm8[11],ymm13[12,13,14],ymm8[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm1 & ymm16) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm6[1,2,3,4,5,6,7],ymm13[8],ymm6[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm3 ^ ymm7)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3],ymm8[4],ymm13[5,6],ymm8[7,8],ymm13[9,10,11],ymm8[12],ymm13[13,14],ymm8[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm1 & ymm16) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm6[1,2,3,4,5,6,7],ymm13[8],ymm6[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[0,7,14] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] -; AVX512-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm7 ^ ymm3)) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm1 & ymm16) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm6[1,2,3,4,5,6,7],ymm15[8],ymm6[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm18 ^ (ymm11 & (ymm2 ^ ymm18)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 +; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[1,8,15] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm11, %xmm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm17 & (ymm2 ^ ymm1)) +; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm7 ^ ymm3)) ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & ymm16) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r8) +; AVX512-FCP-NEXT: vmovdqa %ymm13, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdi) ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = 
ymm18 ^ (ymm3 & (ymm2 ^ ymm18)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm10, %xmm9, %xmm13 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm4 & (ymm6 ^ ymm7)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm13 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm4 ^ (ymm15 & (ymm9 ^ ymm4)) ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem) -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12)) 
-; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm13 & mem) +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm2 ^ ymm18)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm19 = ymm12 ^ (ymm16 & (ymm19 ^ ymm12)) +; AVX512DQ-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm12 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm15, %xmm12, %xmm15 +; AVX512DQ-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm4 ^ ymm9)) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & 
~mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8 -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9)) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm8 & ~mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm16 & (ymm12 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm6 ^ ymm7)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 ^ (ymm8 & (ymm4 ^ ymm9)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8,9,10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm1 & ymm17) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm15 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20 -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm6 ^ ymm7)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm4 ^ ymm9)) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm1 & ymm17) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm7 ^ ymm6)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm14 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm14, 
%xmm1 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm4 ^ ymm9)) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm1 & ymm17) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX512DQ-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512DQ-NEXT: vpor %xmm8, %xmm15, %xmm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm16 & (ymm8 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm7 ^ ymm6)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm4 ^ (ymm15 & (ymm9 ^ ymm4)) ; AVX512DQ-NEXT: 
vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm1 & ymm17) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm2 ^ ymm18)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm16 & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (ymm0 & (ymm9 ^ ymm4)) ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & ymm17) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rsi) +; AVX512DQ-NEXT: vmovdqa %ymm12, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %ymm20, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %ymm21, (%r8) ; AVX512DQ-NEXT: vmovdqa %ymm14, (%r9) -; AVX512DQ-NEXT: vmovdqa %ymm8, (%r10) +; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdi) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm2 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,1,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm5 ^ (ymm3 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm7 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem) -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7,8,9],ymm8[10],ymm14[11,12],ymm8[13],ymm14[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm11 & mem) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm16 & (ymm1 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm18 ^ (ymm6 & (ymm2 ^ ymm18)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,1,3,4,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14 +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm10, %xmm15 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm3 ^ ymm7)) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm15 & ~mem) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm5 ^ (ymm15 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor 
%xmm6, %xmm13, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm7 ^ (ymm6 & (ymm3 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm18 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm1 & ymm16) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,3,5,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm18 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,12] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, 
%ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm3 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5,6],ymm8[7,8],ymm13[9,10],ymm8[11],ymm13[12,13,14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm1 & ymm16) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm13[0],ymm6[1,2,3,4,5,6,7],ymm13[8],ymm6[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm3 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3],ymm8[4],ymm13[5,6],ymm8[7,8],ymm13[9,10,11],ymm8[12],ymm13[13,14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm1 & ymm16) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm6[1,2,3,4,5,6,7],ymm13[8],ymm6[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm18 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[0,7,14] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] -; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm17 & (ymm6 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm7 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm1 & ymm16) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm6[1,2,3,4,5,6,7],ymm15[8],ymm6[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm18 ^ (ymm11 & (ymm2 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm12[1,8,15] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm11, %xmm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm17 & (ymm2 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm7 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & ymm16) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512BW-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512BW-NEXT: vpermw %zmm2, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512BW-NEXT: vpermw %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512BW-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512BW-NEXT: vpermw %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512BW-NEXT: kmovd %r11d, %k5 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: movw $992, %r11w # imm = 0x3E0 -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: movw $-28382, 
%ax # imm = 0x9122 +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm7 {%k4} +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm5 +; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512BW-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512BW-NEXT: movw $8772, %ax # imm = 0x2244 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm5, %ymm9 {%k1} +; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm2 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm14 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %xmm13, %xmm8, %xmm15 ; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 -; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: movw $4644, %ax # imm = 0x1224 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm15 {%k2} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[6,13],zero,zero,xmm16[2,9,u,u,u,u,u,u,u] +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF -; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k3} +; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm2 {%k5} +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm18 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq 
%xmm16, %xmm14, %xmm14 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,xmm16[0,7,14],zero,zero,xmm16[3,10,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] +; AVX512BW-NEXT: vporq %xmm19, %xmm16, %xmm0 +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k4} +; AVX512BW-NEXT: movl $261632, %eax # imm = 0x3FE00 +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm12 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm10 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm16[u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u] +; AVX512BW-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512BW-NEXT: kmovd %edi, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-NEXT: vpor %xmm13, %xmm14, 
%xmm13 +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm14 {%k3} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512BW-NEXT: kmovd %r10d, %k5 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} +; AVX512BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm1, %ymm14 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512BW-NEXT: 
vpblendmw %ymm5, %ymm7, %ymm13 {%k3} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} +; AVX512BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] +; AVX512BW-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm1, %ymm14 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm5 {%k2} +; 
AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k5} +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm20[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-NEXT: vmovdqa %ymm10, (%rdx) ; AVX512BW-NEXT: vmovdqa %ymm12, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm11, (%r8) -; AVX512BW-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-NEXT: vmovdqa %ymm4, (%r10) -; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: 
vmovdqa {{.*#+}} ymm10 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k5 +; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 +; AVX512BW-FCP-NEXT: kmovd %eax, %k5 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} -; AVX512BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF -; AVX512BW-FCP-NEXT: kmovd %r11d, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} -; AVX512BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd 
%r11d, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512BW-FCP-NEXT: kmovd %edi, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm1 {%k2} +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 +; AVX512BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k3} +; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm1 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, 
%xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,3,4,6] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm7 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: movl $261632, %eax # imm = 0x3FE00 +; AVX512BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermw %zmm9, %zmm10, %zmm13 +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm9 {%k5} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,1,3,5,6] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1,2,3,4,5,6,7],ymm12[8],ymm9[9,10,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[5,12] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: movl $-134217728, %eax # imm = 0xF8000000 +; AVX512BW-FCP-NEXT: kmovd %eax, %k5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm14 {%k5} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: 
vpblendw {{.*#+}} ymm13 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm13 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} @@ -6758,369 +6764,369 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[0,7,14] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; 
AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[1,8,15] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k5} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm12, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rdi) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride7_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k5 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: movw $992, %r11w # imm = 0x3E0 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: movw $-28382, %ax # imm = 0x9122 +; AVX512DQ-BW-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm7 {%k4} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-BW-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: movw $8772, %ax # imm = 0x2244 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm5, %ymm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm2 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm8 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm8, %xmm15 ; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 -; AVX512DQ-BW-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} -; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-NEXT: kmovd %edi, %k2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 +; AVX512DQ-BW-NEXT: kmovd %eax, %k5 +; AVX512DQ-BW-NEXT: movw $4644, %ax # imm = 0x1224 +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm15 {%k2} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[6,13],zero,zero,xmm16[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF -; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k3} +; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm2 {%k5} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512DQ-BW-NEXT: kmovd %edi, %k4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX512DQ-BW-NEXT: 
vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm18 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,xmm16[0,7,14],zero,zero,xmm16[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] +; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm16, %xmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k4} +; AVX512DQ-BW-NEXT: movl $261632, %eax # imm = 0x3FE00 +; AVX512DQ-BW-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm10 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm16[u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512DQ-BW-NEXT: kmovd %edi, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm3, %ymm14 {%k3} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: kmovd %r10d, %k5 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k1} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm1, %ymm14 {%k2} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm7, %ymm13 {%k3} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] +; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm1, %ymm14 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm5 {%k2} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k5} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm20[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%rdx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm12, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm11, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%r10) -; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k5 +; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} -; AVX512DQ-BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd 
%ymm13, %ymm12, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), 
%ymm14 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm1 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm7 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: movl $261632, %eax # imm = 0x3FE00 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm9, %zmm10, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm9 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1,2,3,4,5,6,7],ymm12[8],ymm9[9,10,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[5,12] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %eax # imm = 0xF8000000 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} @@ -7129,53 +7135,55 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k5} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rdi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -9336,30 +9344,30 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] -; AVX-NEXT: # xmm4 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = 
[8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm2 -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] +; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX-NEXT: vpor %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] ; AVX-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm1 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] ; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 ; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm9 ; AVX-NEXT: vpor %xmm1, %xmm9, %xmm9 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] -; AVX-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 400(%rdi), %xmm9 -; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX-NEXT: vmovdqa 384(%rdi), %xmm6 -; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX-NEXT: vpblendvb %xmm15, %xmm3, %xmm9, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 400(%rdi), %xmm6 +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vmovdqa 384(%rdi), %xmm8 +; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX-NEXT: vmovdqa 368(%rdi), %xmm9 +; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm2 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm11 ; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -9385,13 +9393,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -9412,17 +9420,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm1 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm0 +; AVX-NEXT: vpshufb %xmm1, %xmm11, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm3 @@ -9437,20 +9445,20 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm13, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm14 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm14 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpblendvb %xmm12, %xmm14, %xmm2, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX-NEXT: vmovq {{.*#+}} xmm8 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 @@ -9462,17 +9470,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm12, %xmm14 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm5 -; AVX-NEXT: vmovdqa %xmm13, %xmm9 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, %xmm12 -; AVX-NEXT: vpor %xmm5, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX-NEXT: vmovdqa %xmm0, %xmm8 +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX-NEXT: vmovdqa %xmm9, %xmm12 +; AVX-NEXT: vpor %xmm5, %xmm2, %xmm0 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX-NEXT: vmovdqa %xmm1, %xmm9 +; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX-NEXT: vmovdqa %xmm13, %xmm6 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm14, %xmm6 -; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = 
[128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0] @@ -9483,15 +9492,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] ; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm14 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5 -; AVX-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm13, %xmm0 +; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: vpblendvb %xmm7, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -9510,15 +9520,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9535,14 +9545,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX-NEXT: vmovdqa %xmm2, %xmm8 ; AVX-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX-NEXT: vmovdqa %xmm10, %xmm2 -; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX-NEXT: vmovdqa %xmm7, %xmm10 +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX-NEXT: vmovdqa %xmm9, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm3 -; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX-NEXT: vmovdqa %xmm7, %xmm9 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0] @@ -9558,44 
+9568,43 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5 ; AVX-NEXT: vmovdqa %xmm8, %xmm11 ; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa %xmm6, %xmm9 +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm6 +; AVX-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm5 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX-NEXT: # xmm11 = mem[0,0] ; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215] +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 432(%rdi), %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX-NEXT: vmovdqa 416(%rdi), %xmm3 -; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm11 ; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] ; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm11 +; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm11 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm15 ; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpblendvb %xmm13, (%rsp), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9604,90 +9613,88 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm14 ; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, 
%xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm13, %xmm6, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm1 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] ; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm8 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] -; AVX-NEXT: # xmm2 = mem[0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm10 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm8 +; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX-NEXT: # xmm6 = mem[0,0] +; AVX-NEXT: vpshufb %xmm6, %xmm12, %xmm10 ; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX-NEXT: # xmm7 = mem[0,0] ; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm12 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm8 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm1 -; AVX-NEXT: 
vmovdqa %xmm4, %xmm8 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm1 +; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm6 +; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX-NEXT: vmovdqa %xmm6, %xmm13 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX-NEXT: vmovdqa %xmm0, %xmm5 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX-NEXT: vmovdqa %xmm9, %xmm14 +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX-NEXT: vpshufb %xmm13, %xmm5, %xmm6 +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm7 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm10 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] -; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] +; AVX-NEXT: # xmm7 = mem[0,0] +; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm10 +; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm12 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; 
AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm10, %ymm0, %ymm10 ; AVX-NEXT: vorps %ymm10, %ymm12, %ymm10 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -9695,56 +9702,59 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 ; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm1 -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm10 -; AVX-NEXT: vmovdqa %xmm3, %xmm5 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm2, %xmm9 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm10 +; AVX-NEXT: vmovdqa %xmm4, %xmm13 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vmovdqa 304(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm14 -; AVX-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm15 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX-NEXT: vmovdqa 336(%rdi), %xmm8 -; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm15 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm11 +; AVX-NEXT: vmovdqa 336(%rdi), %xmm7 +; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm6, %ymm0, 
%ymm0 ; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm10 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] -; AVX-NEXT: # xmm2 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm11 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] +; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm11 ; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd {{.*#+}} xmm8 = [1,8,15,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm15, %xmm14 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload @@ -9755,108 +9765,108 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte 
Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] -; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm6[7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] -; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm6 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] -; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] -; AVX-NEXT: # xmm6 = mem[0,0] -; AVX-NEXT: vmovdqa %xmm4, %xmm12 -; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm7 -; AVX-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX-NEXT: vmovdqa %xmm4, %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX-NEXT: vbroadcastss 
{{.*#+}} xmm3 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX-NEXT: vmovdqa %xmm5, %xmm14 +; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm6[7] +; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] +; AVX-NEXT: # xmm5 = mem[0,0] +; AVX-NEXT: vmovdqa %xmm2, %xmm12 +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm8 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm8, %xmm15, %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm6 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm6, %ymm1, %ymm6 +; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm8 -; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7 -; AVX-NEXT: vorps %ymm7, %ymm8, %ymm0 +; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX-NEXT: vmovdqa %xmm5, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm4[6,7] +; AVX-NEXT: vmovdqa %xmm9, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] ; AVX-NEXT: vmovdqa %xmm11, %xmm1 -; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm4 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX-NEXT: vxorps %xmm7, %xmm7, %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] +; AVX-NEXT: vmovdqa %xmm13, %xmm9 +; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vpshufb %xmm8, %xmm13, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 ; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX-NEXT: vmovdqa %xmm14, %xmm8 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] ; AVX-NEXT: # xmm4 = mem[0,0] ; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm5 ; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] ; AVX-NEXT: # xmm5 = mem[0,0] ; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm12, %xmm11 ; AVX-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm7 +; AVX-NEXT: vmovd {{.*#+}} xmm11 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm7 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload @@ -9865,17 +9875,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vorps %ymm7, %ymm6, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm4 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm0[7] -; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm2[7] +; AVX-NEXT: vpshufb %xmm5, %xmm9, %xmm4 ; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm4 +; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX-NEXT: vmovaps 
%ymm12, %ymm11 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 @@ -9884,12 +9895,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] ; AVX-NEXT: # xmm4 = mem[0,0] @@ -9901,67 +9910,60 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[u,u] ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] ; AVX-NEXT: # xmm5 = mem[0,0] ; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm9 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm9[7] -; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] -; AVX-NEXT: # xmm9 = mem[0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10 +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm2[7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[6,13] ; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX-NEXT: vmovdqa %xmm0, %xmm11 -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm10 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7 +; AVX-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX-NEXT: vandnps %ymm7, %ymm11, %ymm7 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX-NEXT: vorps %ymm7, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm10[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm7 -; AVX-NEXT: vmovdqa %xmm11, %xmm2 +; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm7 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm5[6,13] ; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm13, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] ; AVX-NEXT: # xmm4 = mem[0,0] @@ -9971,73 +9973,74 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] +; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm9, %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] ; AVX-NEXT: # xmm7 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm13 +; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm13 ; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX-NEXT: vandnps %ymm9, %ymm13, %ymm9 +; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 ; AVX-NEXT: vorps %ymm5, %ymm9, %ymm5 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 ; AVX-NEXT: vandps %ymm5, %ymm15, %ymm5 ; AVX-NEXT: vorps %ymm5, %ymm9, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX-NEXT: vmovdqa %xmm8, %xmm14 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm9 +; AVX-NEXT: vmovdqa %xmm3, %xmm13 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[2,9,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX-NEXT: # xmm14 = mem[0,0] +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm6 +; AVX-NEXT: vpshufb %xmm7, %xmm5, 
%xmm6 +; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 ; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX-NEXT: vorps %ymm4, %ymm2, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] ; AVX-NEXT: # xmm6 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] -; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX-NEXT: # xmm8 = mem[0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] @@ -10045,15 +10048,16 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] ; AVX-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] ; AVX-NEXT: # xmm13 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm13, %xmm12, %xmm14 ; AVX-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm14 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vmovaps 
%ymm11, %ymm0 ; AVX-NEXT: vandps %ymm4, %ymm11, %ymm4 ; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 ; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4 @@ -10061,27 +10065,27 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 ; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 ; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm12 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm12 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm8 +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX-NEXT: vpshufb %xmm13, %xmm5, %xmm7 ; AVX-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX-NEXT: vandps %ymm6, %ymm11, %ymm3 -; AVX-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX-NEXT: vandps %ymm0, %ymm6, %ymm3 +; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload ; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 @@ -10121,17 +10125,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride7_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-NEXT: subq $792, %rsp # imm = 0x318 +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm0 +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] @@ -10139,338 +10143,333 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm3 ; AVX2-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm5 +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm8 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm2 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm9, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-NEXT: 
vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm6 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-NEXT: vpblendvb %ymm14, %ymm8, %ymm15, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] -; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm10, %ymm5 +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm8 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm15, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = 
[0,0,0,0,0,0,2,9,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm6 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 -; AVX2-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-NEXT: vpshufb %xmm12, %xmm13, %xmm6 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX2-NEXT: vpshufb %xmm12, %xmm7, %xmm10 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX2-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm15, %ymm0 +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm10 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm11 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,3,10,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX2-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,0,0,5,12,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm9 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 -; AVX2-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX2-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-NEXT: vmovdqa %xmm7, %xmm5 +; AVX2-NEXT: vpshufb %xmm10, %xmm7, %xmm1 +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm9 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] ; 
AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-NEXT: vpshufb %xmm9, %xmm12, %xmm10 -; AVX2-NEXT: vmovdqa %xmm12, %xmm3 -; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12] +; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128] +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm12, %ymm15, %ymm8, %ymm9 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm7 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13] +; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128] +; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX2-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 -; AVX2-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm8, %ymm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm6 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm6 -; AVX2-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm7 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vmovdqa %ymm13, %ymm2 +; AVX2-NEXT: vpblendvb %ymm3, %ymm13, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm12 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm11 -; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14] +; AVX2-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-NEXT: vpblendvb %ymm3, %ymm15, %ymm8, %ymm1 +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX2-NEXT: vpshufb %xmm10, %xmm13, %xmm7 -; AVX2-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm7 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm7 -; AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128] +; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15] +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX2-NEXT: vpor %xmm7, %xmm11, %xmm7 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendvb %ymm3, %ymm15, %ymm12, %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm15, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-NEXT: vpshufb %xmm8, %xmm13, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 -; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm13, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm0, %ymm10 +; AVX2-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm13, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm13, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 -; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm14 +; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm13 +; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 -; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 -; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 -; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm2, %ymm0 +; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm4 +; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm12 +; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm15 +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 -; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 -; AVX2-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 -; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm9, %ymm2 +; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm9, %ymm7 +; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm11 +; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm6, %ymm8 +; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = 
[2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm15 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm15 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] @@ -10478,44 +10477,43 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3],ymm7[4,5,6],ymm1[7,8],ymm7[9,10],ymm1[11],ymm7[12,13,14],ymm1[15] ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 -; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm0 +; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4],ymm12[5,6],ymm3[7,8],ymm12[9,10,11],ymm3[12],ymm12[13,14],ymm3[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm0 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6],ymm1[7,8],ymm11[9,10,11],ymm1[12],ymm11[13,14],ymm1[15] ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10528,8 +10526,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -10537,9 +10534,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm3 ; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15] ; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = 
[6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] @@ -10552,8 +10548,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10561,8 +10556,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -10572,10 +10568,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] @@ -10590,8 +10586,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -10601,9 +10597,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload ; AVX2-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] @@ -10645,23 +10642,23 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm3, 32(%rax) ; AVX2-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-NEXT: addq $760, %rsp # imm = 0x2F8 +; AVX2-NEXT: addq $792, %rsp # imm = 0x318 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride7_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FP-NEXT: subq $792, %rsp # imm = 0x318 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] @@ -10669,338 +10666,333 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm3 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm5 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm8 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm9, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm5 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; 
AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm15, %ymm4 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 -; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm10, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm15, %ymm2 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,2,9,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm13, %xmm6 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm10 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm15, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm11 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,3,10,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,0,0,5,12,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm9 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FP-NEXT: vmovdqa %xmm7, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm7, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm9 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm9 = 
[0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm12, %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12] +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128] +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm15, %ymm8, %ymm9 +; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX2-FP-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm7 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX2-FP-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm10, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm15, %ymm8, %ymm1 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm7 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm13, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm9 -; AVX2-FP-NEXT: vpbroadcastq 
{{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm11 -; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14] +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm10, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm15, %ymm8, %ymm1 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm7 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm7 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX2-FP-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX2-FP-NEXT: vpor %xmm7, %xmm11, %xmm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm15, %ymm12, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm15, %ymm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm13, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm3 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm13, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm13, %ymm0, %ymm10 +; 
AVX2-FP-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm13, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm13, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm14 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm13 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm12 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm2, %ymm6, %ymm15 +; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm2, %ymm6, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm9, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm9, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm11 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm9, %ymm6, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm6, %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm15 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm15 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] @@ -11008,44 +11000,43 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; 
AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3],ymm7[4,5,6],ymm1[7,8],ymm7[9,10],ymm1[11],ymm7[12,13,14],ymm1[15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 -; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4],ymm12[5,6],ymm3[7,8],ymm12[9,10,11],ymm3[12],ymm12[13,14],ymm3[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6],ymm1[7,8],ymm11[9,10,11],ymm1[12],ymm11[13,14],ymm1[15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11058,8 +11049,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -11067,9 +11057,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; 
AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 ; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] @@ -11082,8 +11071,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -11091,8 +11079,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm5 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -11102,10 +11091,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,13,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] @@ -11120,8 +11109,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; 
AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -11131,9 +11120,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] @@ -11175,7 +11165,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FP-NEXT: addq $760, %rsp # imm = 0x2F8 +; AVX2-FP-NEXT: addq $792, %rsp # imm = 0x318 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -11183,86 +11173,82 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; 
AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm13, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm8, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm15, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: 
vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm13, %ymm10, %ymm2 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm14, %ymm11, %ymm4 +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,18446744073709551360,16777215,0] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm8, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm5 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm10, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 @@ -11271,33 +11257,33 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm15, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -11305,180 +11291,175 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm11 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; 
AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm10, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm9, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,5,12] +; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,0,7,14,128,128] +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm12, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm12, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm12 ; 
AVX2-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm4 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 +; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm11 ; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm11 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,128,128,128,6,13] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,u,u,u,1,8,15,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm9, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; 
AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,2,9,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,u,u,u,128,128,0,7,14] +; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm11 ; AVX2-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm11, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm9, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm14 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm10 +; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm12 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm10 +; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm7 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; 
AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm14, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,3,10,128,128,128] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,128,128,1,8,15] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm9, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm13, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm3 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11486,30 +11467,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm14 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm5, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm0 @@ -11519,10 +11499,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm0 @@ -11542,8 +11521,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 @@ -11554,39 +11532,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2,3],ymm3[4],ymm10[5,6],ymm3[7,8],ymm10[9,10,11],ymm3[12],ymm10[13,14],ymm3[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -11594,87 +11548,107 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1,2,3],ymm2[4],ymm14[5,6],ymm2[7,8],ymm14[9,10,11],ymm2[12],ymm14[13,14],ymm2[15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13,14,15] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm7, 
%xmm6, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7,8],ymm6[9],ymm7[10,11,12],ymm6[13],ymm7[14,15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,1,2,1,3,5,6] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6,7],ymm11[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm8 +; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendw $254, (%rsp), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpblendw 
$254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -11683,192 +11657,192 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FCP-NEXT: addq $776, %rsp # imm = 0x308 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm31 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm31 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm29 +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm31 ^ ymm29)) ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm19 -; AVX512-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm19 ^ ymm31)) -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = 
[65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm29 -; AVX512-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm29 ^ ymm21)) -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm30 +; AVX512-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm30 ^ ymm4)) +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm1 & mem) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa %ymm12, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm26 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm4 & (zmm22 ^ zmm2)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512-NEXT: vmovdqa 224(%rdi), %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm23 ; AVX512-NEXT: vmovdqa64 288(%rdi), %ymm18 ; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm16 -; AVX512-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512-NEXT: vmovdqa %ymm5, %ymm2 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vporq %xmm6, %xmm2, %xmm21 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm28 -; AVX512-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17)) +; AVX512-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm14 ^ ymm17)) ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm23) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm31 ^ ymm19)) -; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem) -; AVX512-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm29 ^ ymm21)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512-NEXT: vpor %xmm0, 
%xmm2, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512-NEXT: vmovdqa %ymm7, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm31 ^ ymm29)) +; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm4 ^ ymm30)) +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7,8,9],ymm4[10],ymm13[11,12,13],ymm4[14],ymm13[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem) +; AVX512-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm25 ^ (ymm3 & (ymm26 ^ ymm25)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm11)) +; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm21 & ymm20) +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm24 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm4 & (zmm24 ^ zmm15)) -; AVX512-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm24 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm19 & (zmm24 ^ zmm13)) +; AVX512-NEXT: vmovdqa %ymm12, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19)) -; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm6 ^ ymm30)) +; AVX512-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8,9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15) -; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm19 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm25 ^ ymm26)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25 -; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2)) -; AVX512-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm13 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm15[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm4) +; AVX512-NEXT: vpor %xmm11, %xmm13, %xmm0 +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm25 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm21 & (zmm25 ^ zmm2)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19)) -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15] +; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %ymm12, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm22 ^ ymm30)) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2],ymm6[3],ymm2[4,5,6],ymm6[7,8],ymm2[9,10],ymm6[11],ymm2[12,13,14],ymm6[15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm15) -; AVX512-NEXT: vmovdqa %ymm15, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm4) ; AVX512-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm19 ^ ymm26)) +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm22 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512-NEXT: vmovdqa %xmm5, %xmm10 -; AVX512-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm11 +; AVX512-NEXT: vmovdqa %xmm10, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (mem & (ymm11 ^ ymm0)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm15[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX512-NEXT: vmovdqa64 416(%rdi), %ymm26 -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm30 +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm11 ; AVX512-NEXT: vmovdqa64 384(%rdi), %ymm27 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm3 ^ (zmm21 & (zmm11 ^ zmm3)) ; AVX512-NEXT: vmovdqa %ymm7, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -11876,249 +11850,249 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm8)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 -; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22)) -; AVX512-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm28 ^ ymm17)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; 
AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23) -; AVX512-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10] -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm24)) -; AVX512-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm18 ^ ymm16)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm4 & (ymm0 ^ ymm8)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX512-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm14 ^ ymm17)) +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7,8,9,10],ymm13[11],ymm8[12,13],ymm13[14],ymm8[15] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm26 ^ (ymm8 & (ymm27 ^ ymm26)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm13 & (zmm21 ^ zmm23)) +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm20) +; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm4 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm17 ^ ymm28)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23) -; AVX512-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm13 & (zmm23 ^ zmm24)) +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm14 ^ (ymm1 & (ymm17 ^ ymm14)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm27 ^ ymm26)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm20) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11] -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm25)) -; AVX512-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm13 & (zmm24 ^ zmm25)) +; AVX512-NEXT: vmovdqa %ymm12, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm17 ^ ymm14)) +; AVX512-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm1 & (ymm18 ^ ymm16)) ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u] ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm18 ^ ymm16)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0 +; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm20) | ymm0 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm4 & (ymm0 ^ ymm1)) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30)) -; AVX512-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0 -; AVX512-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm13 & (zmm25 ^ zmm11)) +; AVX512-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm17 ^ ymm14)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm16 ^ ymm18)) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm8 +; AVX512-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm1 & (ymm30 ^ ymm2)) -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm28 ^ ymm17)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm14 ^ ymm17)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm16 ^ ymm18)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0 -; AVX512-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~mem) | ymm0 +; AVX512-NEXT: vmovdqa %ymm5, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512-NEXT: vpshufb %ymm13, %ymm11, %ymm11 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm20) | ymm11 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm1 & (ymm23 ^ ymm2)) -; AVX512-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm27 ^ ymm26)) -; AVX512-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm13 ^ ymm12)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm31 ^ ymm19)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm14 & (ymm16 ^ ymm18)) -; AVX512-NEXT: vmovdqa %ymm7, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm19 ^ ymm31)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm19 ^ ymm31)) -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa %ymm11, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm11) -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm2 & ymm1) -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm1) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm29 ^ (ymm12 & (ymm21 ^ ymm29)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm20 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm4 & (ymm20 ^ ymm8)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm4 & (ymm8 ^ ymm1)) +; AVX512-NEXT: vpor %xmm2, %xmm11, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm28 = ymm26 ^ (ymm28 & (ymm27 ^ ymm26)) +; AVX512-NEXT: vmovdqa %ymm12, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqa %ymm12, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm31 ^ (ymm5 & (ymm29 ^ ymm31)) +; AVX512-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm10 ^ ymm30)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm12 & (ymm16 ^ ymm18)) +; AVX512-NEXT: vmovdqa %ymm7, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm10 ^ (ymm13 & (ymm30 ^ ymm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm10 ^ (ymm12 & (ymm30 ^ ymm10)) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6],ymm10[7,8],ymm9[9,10,11],ymm10[12],ymm9[13,14],ymm10[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm0 & ymm9) +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm18 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ymm9) +; AVX512-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512-NEXT: vmovdqa %xmm10, %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; 
AVX512-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm28 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17)) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7,8],ymm10[9],ymm12[10,11,12],ymm10[13],ymm12[14,15] ; AVX512-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm29 ^ (ymm15 & (ymm21 ^ ymm29)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm29 ^ ymm21)) -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm0 & ymm3) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm17 ^ (ymm7 & (ymm14 ^ ymm17)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm19 ^ ymm22)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm19 ^ (ymm7 & (ymm22 ^ ymm19)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm3)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX512-NEXT: vmovdqa %xmm12, %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm15)) +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa %xmm4, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = 
[u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm3)) -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %xmm7, %xmm15, %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm1 & (ymm12 ^ ymm3)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11)) -; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm18 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vpor %xmm4, %xmm7, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm18 & (ymm7 ^ ymm1)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm13)) +; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm9)) +; AVX512-NEXT: 
vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] -; AVX512-NEXT: vextracti32x4 $1, %ymm16, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u] +; AVX512-NEXT: vextracti32x4 $1, %ymm16, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] +; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & mem) | ymm0 +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512-NEXT: vextracti32x4 $1, %ymm28, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm20, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm22, (%rdx) +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm21, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm25, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -12127,34 +12101,34 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: 
vmovdqa64 (%rdi), %ymm20 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30)) ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm28 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm28 ^ ymm27)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,1,2,4,6] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 @@ -12195,204 +12169,204 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 ^ (ymm13 & (ymm20 ^ ymm12)) +; AVX512-FCP-NEXT: vextracti128 $1, 
%ymm13, %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm31 ^ (ymm15 & (ymm30 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7,8,9],ymm6[10],ymm15[11,12,13],ymm6[14],ymm15[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm13 & ~mem) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm28 ^ ymm27)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem) -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 -; 
AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13)) +; AVX512-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,1,3,4,6] +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ zmm15)) ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7,8,9,10],ymm7[11],ymm15[12,13],ymm7[14],ymm15[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm17 ^ (ymm15 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero ; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] -; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & 
ymm26) +; AVX512-FCP-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm23 & (ymm0 ^ ymm7)) ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25) -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm3)) +; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm30 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm25) +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm27 ^ ymm28)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,1,3,5,6] +; AVX512-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm22 & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm16 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26) -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm26) +; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm8)) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm0)) +; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm30 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm27 ^ ymm28)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm0 & ymm25) +; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm13 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7)) -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm8 ^ (zmm22 & (zmm0 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm11 ^ (ymm5 & (ymm16 ^ ymm11)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm18 ^ (ymm6 & (ymm17 ^ ymm18)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm23 & (ymm4 ^ ymm5)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm11 ^ ymm16)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0 +; AVX512-FCP-NEXT: vporq %xmm5, %xmm4, %xmm29 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = (ymm29 & ymm26) | ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 -; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm14 ^ (ymm4 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm11 ^ ymm16)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~mem) | ymm6 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm18 ^ (ymm6 & (ymm17 ^ ymm18)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm23 & (ymm5 ^ ymm29)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm6, %ymm0, %ymm26 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20)) ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 @@ -12401,112 +12375,112 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20)) ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) +; AVX512-FCP-NEXT: vporq %xmm6, %xmm12, %xmm17 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm30 ^ ymm31)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vporq %xmm12, %xmm0, %xmm18 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16)) ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm31 ^ ymm30)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15] +; AVX512-FCP-NEXT: vporq %xmm6, %xmm10, %xmm16 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6],ymm0[7,8],ymm7[9,10,11],ymm0[12],ymm7[13,14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm17 & ymm25) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm18 & ymm25) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm16 & ymm25) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm28 ^ (ymm12 & (ymm27 ^ ymm28)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm27 ^ ymm28)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm28 ^ ymm27)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX512-FCP-NEXT: vpor %xmm12, %xmm9, %xmm9 +; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm13 & (ymm9 ^ ymm0)) +; AVX512-FCP-NEXT: vpor %xmm12, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm13 & (ymm4 ^ ymm0)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm13 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,6,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 +; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,4,6,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,5,6,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm7)) +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm6)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 ; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) @@ -12516,216 +12490,219 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $24, %rsp +; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm31 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm31 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13)) +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm31 ^ ymm13)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm30 ; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm28 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm28 ^ ymm31)) -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm12 ^ ymm30)) +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm1 & mem) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm27 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm27 ^ ymm21)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm2)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 288(%rdi), %ymm18 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm17 ^ ymm18)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm15 +; AVX512DQ-NEXT: vpor %xmm0, %xmm6, %xmm0 +; 
AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm20 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm16 -; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm24) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15 -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm29 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm28 ^ (ymm15 & (ymm31 ^ ymm28)) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem) -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm16 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4,5],ymm7[6],ymm0[7,8,9],ymm7[10],ymm0[11,12,13],ymm7[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm31 ^ ymm13)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm15, %zmm1, %zmm22 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm27 ^ ymm21)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm30 ^ ymm12)) ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm4 ^ ymm25)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm26 -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm23 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm15)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7,8,9],ymm4[10],ymm3[11,12,13],ymm4[14],ymm3[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ~mem) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm9)) +; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm3)) ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm31 ^ ymm13)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) -; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27) -; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512DQ-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm0 & (ymm25 ^ ymm6)) -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19 +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm30 ^ ymm12)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm15 ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm19) +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm21 ^ ymm27)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm7 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm31 ^ ymm13)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm27) -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm19 ^ (ymm0 & (ymm15 ^ ymm19)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm1 ^ (zmm28 & (zmm24 ^ zmm1)) +; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm30 ^ ymm12)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = 
ymm27 ^ (ymm1 & (ymm21 ^ ymm27)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm19) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm5 ; AVX512DQ-NEXT: vpor %xmm5, %xmm15, %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm26 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm27 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm6 & (ymm5 ^ ymm0)) +; AVX512DQ-NEXT: vpor %xmm15, %xmm9, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm25 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm26 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm3 ^ (zmm28 & (zmm29 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = ymm8 | (ymm9 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm8)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm8 = 
[4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22)) -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm16 ^ ymm21)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm8)) +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm18 ^ ymm17)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm8 & (zmm19 ^ zmm22)) +; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm20 ^ (ymm3 & (ymm16 ^ ymm20)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24) -; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm25 ^ (ymm5 & (ymm26 ^ ymm25)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm9) +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10] -; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm3)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm23)) ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 @@ -12734,649 +12711,639 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm21 ^ ymm16)) +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm20 ^ ymm16)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24) -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm9) +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11] ; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm3)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm25)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm24)) ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm20 ^ ymm16)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero -; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm18 ^ ymm17)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, 
%ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm25 ^ ymm26)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm9) | ymm0 +; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30)) -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm29)) +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm17 ^ ymm18)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm20 ^ ymm16)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13,14,15] +; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm9) | ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm25 ^ ymm26)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm24 = ymm24 ^ (ymm29 & (ymm24 ^ ymm2)) -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm16 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm2 & (ymm8 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm16 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm17 ^ ymm18)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~mem) | ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm25 ^ ymm26)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm29 & (ymm30 ^ ymm2)) -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm2 & (ymm29 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm13 ^ ymm31)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm29 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm19 = ymm26 ^ (ymm19 & (ymm27 ^ ymm26)) +; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm1 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm15 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm13 ^ ymm31)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm28 = ymm25 ^ (ymm28 & (ymm26 ^ ymm25)) +; AVX512DQ-NEXT: vpor %xmm9, %xmm15, %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm11 & (ymm13 ^ ymm31)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm30 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm13 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm17 = ymm18 ^ (ymm14 & (ymm17 ^ ymm18)) -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm28 ^ ymm31)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm28 ^ ymm31)) -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm29 & ymm11) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 
= ymm11 | (ymm0 & ymm1) -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm12 ^ ymm30)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm30 ^ (ymm14 & (ymm12 ^ ymm30)) +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6],ymm2[7,8],ymm3[9,10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm1 & ymm12) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ymm12) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7,8],ymm2[9],ymm14[10,11,12],ymm2[13],ymm14[14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm3 & ymm1) -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm5 & (ymm13 ^ ymm6)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm18 & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21)) -; AVX512DQ-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm13 ^ ymm6)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm7 & (ymm6 ^ ymm13)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm5 & ymm12) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm21 ^ ymm27)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm14, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm6 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm20 ^ (ymm10 & (ymm16 ^ ymm20)) +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm21 ^ ymm27)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm21 ^ (ymm10 & (ymm27 ^ ymm21)) +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm12)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm3)) +; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm14, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm18 & (ymm5 ^ ymm3)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX512DQ-NEXT: vextracti128 $1, 
%ymm10, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm7 & (ymm5 ^ ymm3)) +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm14, %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm18 & (ymm8 ^ ymm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm4 & (ymm8 ^ ymm3)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11)) -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm9)) -; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-NEXT: vextracti32x4 $1, %ymm17, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] -; AVX512DQ-NEXT: vextracti32x4 $1, %ymm17, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-NEXT: 
vextracti32x4 $1, %ymm19, %xmm0 +; AVX512DQ-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-NEXT: vextracti32x4 $1, %ymm28, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-NEXT: addq $24, %rsp +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: pushq %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm11 ^ ymm23)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27)) -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm2 = ymm2 | (ymm1 & mem) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm2 & mem) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm28 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,1,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, 
%zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm12 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm9, %xmm13 ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm14 ^ (ymm9 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4,5],ymm15[6],ymm9[7,8,9],ymm15[10],ymm9[11,12,13],ymm15[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
ymm9 = ymm17 ^ (ymm9 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm13 & ymm26) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm15)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm11 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm30 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm4 & ~mem) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm27 ^ (ymm4 & (ymm28 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,1,3,4,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm16 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm13 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] -; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm17 ^ (ymm15 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm21 & (ymm3 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm11 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm31 ^ (ymm4 & (ymm30 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7,8,9,10],ymm9[11],ymm4[12,13],ymm9[14],ymm4[15] ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm28 ^ (ymm4 & (ymm27 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,1,3,5,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm13 = zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 ^ (ymm13 & (ymm16 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm13 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm17 ^ (ymm15 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm11 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm31 ^ (ymm4 & (ymm30 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4,5,6],ymm9[7,8],ymm4[9,10],ymm9[11],ymm4[12,13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm3 & ymm25) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm27 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm9 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 
%xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm22 & (zmm0 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm12 ^ (ymm6 & (ymm16 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm12 ^ ymm16)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm17 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm0 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm12 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] -; 
AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm23 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm23 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm23 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %xmm6, %xmm9, %xmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm31 ^ (ymm4 & (ymm30 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm16 ^ (ymm10 & (ymm12 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm30 ^ (ymm10 & (ymm31 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3],ymm9[4],ymm4[5,6],ymm9[7,8],ymm4[9,10,11],ymm9[12],ymm4[13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm17 & ymm25) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm0 & ymm25) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9 +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm8 & ymm25) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm28 ^ (ymm11 & (ymm27 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm27 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm28 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm9 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm11 & (ymm9 ^ ymm8)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; 
AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm11 & (ymm8 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm11 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,6,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd 
%ymm12, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm6)) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,4,6,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,5,6,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 ; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-FCP-NEXT: popq %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] -; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] -; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] -; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] -; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512BW-NEXT: vpermw %zmm24, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] +; AVX512BW-NEXT: vpermw %zmm24, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} -; AVX512BW-NEXT: kmovq %k1, %k2 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm0 {%k1} +; AVX512BW-NEXT: kmovq %k1, %k3 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm4, %xmm3, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermw %zmm24, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm4, %xmm0, %xmm23 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm23 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-NEXT: movw $8772, %ax # imm = 0x2244 @@ -13388,112 +13355,115 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = 
[4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm17 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %xmm17, %xmm7, %xmm3 ; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm8 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm26 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm12 +; AVX512BW-NEXT: vmovdqa64 224(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm27[0,7,14],zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-NEXT: vpor %xmm5, %xmm13, %xmm5 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] -; AVX512BW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movw $9288, %r10w # imm = 0x2448 +; AVX512BW-NEXT: kmovd %r10d, %k5 +; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm5 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u] +; AVX512BW-NEXT: kmovq %rax, %k2 +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vporq %xmm14, %xmm5, %xmm19 +; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512BW-NEXT: vpblendmw %ymm16, %ymm15, %ymm0 {%k6} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7,8,9],ymm5[10],ymm0[11,12,13],ymm5[14],ymm0[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512BW-NEXT: 
kmovd %eax, %k7 -; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} -; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k1} +; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm14 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512BW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20 +; AVX512BW-NEXT: vpblendmw %ymm14, %ymm5, %ymm0 {%k4} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm0, %xmm20 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[2,9] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero +; AVX512BW-NEXT: vporq %xmm20, %xmm0, %xmm20 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm0 {%k4} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,zero,xmm21[6,13],zero,zero,xmm21[2,9,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm21 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[u,u,u,u] +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm10 {%k1} +; AVX512BW-NEXT: vporq %xmm22, %xmm21, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm27[1,8,15],zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k6} +; AVX512BW-NEXT: vporq %xmm2, %xmm21, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm22[2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm22[0,7,14],zero,zero,xmm22[3,10,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] +; AVX512BW-NEXT: vpermw %zmm24, %zmm22, %zmm22 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-NEXT: movl $261632, %eax # imm = 0x3FE00 +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm21 {%k7} = 
ymm22[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k3} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm22[u,u,0,7,14],zero,zero,xmm22[3,10],zero,zero,zero,xmm22[u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm17[0],xmm2[0],xmm17[1],xmm2[1],xmm17[2],xmm2[2],xmm17[3],xmm2[3] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm27[2,9],zero,zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm2, %xmm2 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm17 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm17[3,10],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor 
%xmm2, %xmm14, %xmm2 -; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512BW-NEXT: kmovd %edi, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-NEXT: vporq %xmm22, %xmm14, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 +; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm25 = zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512BW-NEXT: vporq %xmm22, %xmm25, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k7} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k4} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm2, %xmm18 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm18, %xmm3, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm18, %xmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18 @@ -13501,416 +13471,419 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: vmovdqu8 %ymm18, 
%ymm3 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm27[3,10],zero,zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm18, %ymm2 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm18, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k3} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} ; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-NEXT: vextracti32x4 $1, %ymm2, %xmm19 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] +; AVX512BW-NEXT: kmovq %rax, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] +; AVX512BW-NEXT: vporq %xmm19, %xmm0, %xmm0 +; AVX512BW-NEXT: vpblendmw %ymm16, %ymm15, %ymm2 {%k5} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm14, %ymm5, %ymm3 {%k6} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} +; AVX512BW-NEXT: 
vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k3} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[6,13,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512BW-NEXT: vpblendmw %ymm15, %ymm16, %ymm3 {%k4} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm14, %ymm5, %ymm4 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11] +; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} +; AVX512BW-NEXT: vporq %xmm19, %xmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k5} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm0 +; AVX512BW-NEXT: vpblendmw %ymm15, %ymm16, %ymm2 {%k6} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6],ymm3[7,8],ymm2[9,10,11],ymm3[12],ymm2[13,14],ymm3[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm14, %ymm2 {%k4} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k3} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm19 {%k6} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512BW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512BW-NEXT: vpblendmw %ymm15, %ymm16, %ymm3 {%k5} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm14, %ymm22 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm4 {%k4} +; AVX512BW-NEXT: vporq %xmm0, %xmm2, %xmm21 +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[2,9,u,u,u] +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm21 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vpblendmw %ymm16, %ymm15, %ymm2 {%k1} +; AVX512BW-NEXT: kmovq %k1, %k2 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512BW-NEXT: vpblendmw 
%ymm13, %ymm12, %ymm2 {%k4} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} -; AVX512BW-NEXT: kmovq %k1, %k7 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm22[u,u,u,u,u,u,u,5,12],zero,zero,xmm22[1,8,15],zero,zero +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm22 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm14, %ymm2 {%k5} +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm21 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} -; AVX512BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k3} +; AVX512BW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] ; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] -; AVX512BW-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm12 ; AVX512BW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} -; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm5 {%k2} +; AVX512BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm13 {%k4} +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm0 {%k5} +; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k5} ; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm19[u,u,2,9],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[u,u,u,u,u] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} +; AVX512BW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] -; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] -; AVX512BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] -; AVX512BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] -; AVX512BW-NEXT: vpermw %zmm25, %zmm10, %zmm10 -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512BW-NEXT: vpermw %zmm24, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512BW-NEXT: vpermw %zmm24, %zmm6, %zmm6 +; AVX512BW-NEXT: 
vmovdqa {{.*#+}} ymm7 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[2,9],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermw %zmm24, %zmm7, %zmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k7} = ymm7[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm7 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm27, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm8 {%k7} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm22[5,12],zero,zero,xmm22[1,8,15],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k7} = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm27[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k7} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} 
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k7} = ymm4[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm27[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm26, %xmm4 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k7} ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdi) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 -; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm0, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm10, %ymm5 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm5, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1} -; 
AVX512BW-FCP-NEXT: kmovq %k1, %k3 -; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] +; AVX512BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm8 {%k6} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm0 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,1,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] ; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7} -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 +; 
AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm14[u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k3} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm5, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4} +; AVX512BW-FCP-NEXT: kmovd %eax, %k5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm18 {%k5} ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm14, %ymm21 {%k1} +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm10, %ymm18 {%k5} +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm18 {%k7} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm14 +; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [u,u,u,u,1,3,4,6] ; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm10, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm18[2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm18, %xmm18 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm12 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm14 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm12 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,1,3,5,6] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm10, %ymm17 {%k7} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm17[3,10],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm13 {%k5} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm24, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = 
xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm15 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: kmovq %rax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm16[5,12],zero,zero ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k6} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6} +; 
AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: kmovq %k1, %k6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm15 {%k7} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] @@ -13919,533 +13892,537 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k7} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm16 {%k5} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 -; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; 
AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm2, %ymm14 {%k5} +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm15, %zmm15 +; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm19 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,0,7,14],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm21 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm22 {%k5} +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm15 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm21, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm22[u,u,u,u,1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm19, %ymm16 {%k1} ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm19 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm20 +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u],zero,zero,xmm20[3,10],zero,zero,zero,xmm20[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,5,12],zero,zero,xmm19[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm2, %ymm19 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm22 {%k7} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm22[u,u,u,u,u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm16 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm2 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm7 {%k5} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] +; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm11, %xmm11 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] ; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm7, %xmm20 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} 
xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] 
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k1} = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,6,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[5,12],zero,zero,xmm19[1,8,15],zero,zero,xmm19[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm19, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,4,6,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm14 {%k4} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, 
%zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm10 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm8[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,6,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm1 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = 
[8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] +; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] -; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm9, %ymm3 {%k1} ; AVX512DQ-BW-NEXT: kmovq %k1, %k2 ; AVX512DQ-BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm23 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512DQ-BW-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm5, %ymm3 {%k6} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm23 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %xmm28 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm20 = [0,0,0,0,0,0,4,11,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm28, %xmm4 +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 224(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm27[0,7,14],zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} +; AVX512DQ-BW-NEXT: kmovq %rax, %k7 ; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm12 ; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm6 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k7} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm0, %xmm6, %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm14, %ymm0 {%k6} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7,8,9],ymm6[10],ymm0[11,12,13],ymm6[14],ymm0[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512DQ-BW-NEXT: kmovd %eax, %k7 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; 
AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %ymm15 -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQ-BW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 +; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm6, %ymm21 {%k4} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm22 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm21[4,11],zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k1} +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm9, %ymm0 {%k4} +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm0, %xmm24 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,zero,xmm24[6,13],zero,zero,xmm24[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512DQ-BW-NEXT: vporq %xmm24, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] +; AVX512DQ-BW-NEXT: movl $511, %eax # imm = 0x1FF +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm5, %ymm22 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm10 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vporq %xmm0, %xmm22, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; 
AVX512DQ-BW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm24 = xmm28[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm27[1,8,15],zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm9, %ymm22 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm24 = xmm22[2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm22[0,7,14],zero,zero,xmm22[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 +; AVX512DQ-BW-NEXT: vporq %xmm24, %xmm22, %xmm22 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQ-BW-NEXT: kmovd %edi, %k5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm14, %xmm14 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendmw 
%ymm6, %ymm11, %ymm3 {%k4} -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm22 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm11, %ymm18 {%k2} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k7} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm18[u,u,0,7,14],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm8, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm28[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm18[0],xmm1[0],xmm18[1],xmm1[1],xmm18[2],xmm1[2],xmm18[3],xmm1[3] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm27[2,9],zero,zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm9, %ymm18 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[3,10],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm18[1,8,15],zero,zero,xmm18[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm20, %xmm18, %xmm20 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm11, %ymm18 {%k4} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[2,9,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm18[u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm28[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vporq %xmm1, %xmm18, %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2 ; 
AVX512DQ-BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm27[3,10],zero,zero,zero,xmm27[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vporq %xmm1, %xmm18, %xmm1 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 +; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm1 {%k4} ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm14, %ymm3 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm6, %ymm4 {%k6} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = 
xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] +; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm1 {%k6} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} -; AVX512DQ-BW-NEXT: 
vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm16, %ymm1 {%k4} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm6, %ymm3 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm1 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm16, %ymm1 {%k6} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6],ymm4[7,8],ymm1[9,10,11],ymm4[12],ymm1[13,14],ymm4[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm15, %ymm4 {%k4} +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm1 {%k7} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} -; AVX512DQ-BW-NEXT: 
vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: kmovq %k1, %k7 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm16, %ymm4 {%k3} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13,14,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm15, %ymm0 {%k6} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13] +; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm20 {%k4} +; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm1, %xmm19 +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm19 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm20[u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm14, %ymm3 {%k7} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} -; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-BW-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm20 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm15, %ymm1 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} +; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm11, %ymm22 {%k6} +; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm2, %ymm21 {%k6} +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm16, %ymm14 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[2,3,0,1] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4],ymm1[5],ymm14[6,7,8,9],ymm1[10],ymm14[11,12],ymm1[13],ymm14[14,15] +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm12 ; AVX512DQ-BW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} -; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm2, %ymm0 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm15, %ymm6 {%k7} +; 
AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm11, %ymm1 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm2 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm5 {%k4} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] -; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] -; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm28[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm28[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm28[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512DQ-BW-NEXT: 
vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] +; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] -; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm11, %zmm11 -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} -; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm27, %xmm11 +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm27[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm9, %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm5[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm26, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm27[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm12[0,1,2],ymm2[3,4,5,6,7],ymm12[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdi) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdi) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -14454,21 +14431,21 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -14484,319 +14461,317 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 -; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm6 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,1,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm6, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm7, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[2,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k4} ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 
= ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[1,8,15],zero,zero,xmm19[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm22, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm15, %xmm22 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm22 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm19[2,9],zero,zero,zero,xmm19[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm17 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[3,10],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm25 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm22, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm19[3,10],zero,zero,zero,xmm19[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k2} +; 
AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm15 {%k6} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[6,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm16 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm16[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm16[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[1,8,15],zero,zero,xmm16[4,11] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm15 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = 
ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm15, %zmm15 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm20 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm14 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,5,12],zero,zero,xmm20[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm23 {%k4} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm23, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u],zero,zero,zero,xmm16[6,13],zero,zero,xmm16[2,9,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm23[u,u,u,u,1,8,15],zero,zero,xmm23[4,11],zero,zero,xmm23[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k7} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,2,9],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm22 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k7} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; 
AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm22[u,u,3,10],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm9, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm9, %xmm19, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm21[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [1,2,4,6,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm19, %ymm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,4,6,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5} ; 
AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,5,6,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm16 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm20, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k5} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi @@ -14804,8 +14779,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdi) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 99932c0026b23..7acc94ab6a5df 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -35,8 +35,8 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 @@ -72,9 +72,6 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride8_vf2: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -89,16 +86,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX-NEXT: vpextrw $0, %xmm6, (%r11) -; AVX-NEXT: vpextrw $0, %xmm7, (%r10) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vpextrw $0, %xmm7, (%rax) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride8_vf2: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -113,16 +110,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-NEXT: vpextrw $0, %xmm6, (%r11) -; 
AVX2-NEXT: vpextrw $0, %xmm7, (%r10) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpextrw $0, %xmm7, (%rax) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride8_vf2: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -137,16 +134,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%r11) -; AVX2-FP-NEXT: vpextrw $0, %xmm7, (%r10) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vpextrw $0, %xmm7, (%rax) +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride8_vf2: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -161,16 +158,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) ; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) ; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9) -; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%r11) -; AVX2-FCP-NEXT: vpextrw $0, %xmm7, (%r10) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vpextrw $0, %xmm7, (%rax) +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -184,16 +181,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride8_vf2: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -207,16 +204,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride8_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -230,16 +227,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf2: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -253,16 +250,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride8_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -276,16 +273,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512BW-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512BW-NEXT: vpextrw $0, %xmm6, 
(%r10) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride8_vf2: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -299,16 +296,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride8_vf2: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -322,16 +319,16 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm7, (%rax) ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf2: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -345,8 +342,11 @@ define void @load_i8_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r9) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r11) -; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, 
%xmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <16 x i8>, ptr %in.vec, align 64 @@ -522,37 +522,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -571,37 +571,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FP-NEXT: 
vmovdqa 16(%rdi), %xmm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -620,37 +620,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: 
vpshufb %xmm3, %xmm2, %xmm4 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -1186,74 +1186,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpcklwd 
{{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1277,74 +1277,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: 
vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} 
xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1366,30 +1366,30 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,1,5,9,13,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 @@ -2269,13 +2269,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-LABEL: load_i8_stride8_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2 @@ -2285,147 +2285,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; 
AVX2-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX2-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX2-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX2-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm12, %xmm4, %xmm13 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX2-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX2-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX2-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-NEXT: 
vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX2-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm13, %xmm8, %xmm14 ; AVX2-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm14 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm8 ; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] @@ -2449,13 +2449,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-LABEL: load_i8_stride8_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 @@ -2465,147 +2465,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; 
AVX2-FP-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3] ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: 
vpshufb %xmm12, %xmm2, %xmm13 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm4, %xmm13 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; 
AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] @@ -2632,12 +2632,12 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm3 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm3 @@ -2662,18 +2662,18 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm1 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, 
%xmm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm5 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 @@ -2726,13 +2726,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 @@ -2742,11 +2742,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vpmovqb %zmm5, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] @@ -2754,11 +2754,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $8, %zmm5, %zmm7 ; AVX512-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -2766,11 +2766,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $16, %zmm5, %zmm8 ; AVX512-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512-NEXT: vmovq {{.*#+}} xmm8 = 
[0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -2778,11 +2778,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $24, %zmm5, %zmm9 ; AVX512-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -2790,11 +2790,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $32, %zmm5, %zmm10 ; AVX512-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -2802,11 +2802,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $40, %zmm5, %zmm11 ; AVX512-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] @@ -2814,11 +2814,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpsrlq $48, %zmm5, %zmm12 ; AVX512-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -2844,41 +2844,41 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512-FCP-NEXT: vpmovqd %ymm9, %xmm8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX512-FCP-NEXT: vpmovqd %ymm10, %xmm11 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 ; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: 
vpshufb %xmm15, %xmm11, %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] ; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 ; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] ; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 @@ -2921,13 +2921,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 @@ -2937,11 +2937,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vpmovqb %zmm5, %xmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] @@ -2949,11 +2949,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $8, %zmm5, %zmm7 ; AVX512DQ-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -2961,11 +2961,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $16, %zmm5, %zmm8 ; 
AVX512DQ-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -2973,11 +2973,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $24, %zmm5, %zmm9 ; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -2985,11 +2985,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $32, %zmm5, %zmm10 ; AVX512DQ-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -2997,11 +2997,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $40, %zmm5, %zmm11 ; AVX512DQ-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm12 = 
[0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] @@ -3009,11 +3009,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpsrlq $48, %zmm5, %zmm12 ; AVX512DQ-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -3039,41 +3039,41 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX512DQ-FCP-NEXT: vpmovqd %ymm9, %xmm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vpmovqd %ymm10, %xmm11 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; 
AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] ; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 @@ -3116,13 +3116,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 @@ -3132,11 +3132,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vpmovqb %zmm5, %xmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] @@ -3144,11 +3144,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm7 ; AVX512BW-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = 
[2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -3156,11 +3156,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm8 ; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -3168,11 +3168,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $24, %zmm5, %zmm9 ; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -3180,11 +3180,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $32, %zmm5, %zmm10 ; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -3192,11 +3192,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $40, %zmm5, %zmm11 ; AVX512BW-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; 
AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] @@ -3204,11 +3204,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm12 ; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -3234,41 +3234,41 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 ; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = 
[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] ; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 @@ -3311,13 +3311,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3 @@ -3327,11 +3327,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vpmovqb %zmm5, %xmm6 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] @@ -3339,11 +3339,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -3351,11 +3351,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -3363,11 +3363,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpmovqb %zmm9, %xmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -3375,11 +3375,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} 
xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -3387,11 +3387,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm11 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] @@ -3399,11 +3399,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -3429,41 +3429,41 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 @@ -4902,7 +4902,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm2 @@ -4910,7 +4910,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm6 @@ -4924,11 +4924,11 @@ define 
void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm11 ; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm13 @@ -4972,24 +4972,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm5, %xmm14, %xmm7 @@ -5023,24 +5023,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm14, %xmm7 @@ -5074,24 +5074,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -5125,24 +5125,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw 
{{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -5175,24 +5175,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; 
AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -5226,24 +5226,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -5276,24 +5276,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: 
vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -5350,7 +5350,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm2 @@ -5358,7 +5358,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm2 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm6 @@ -5372,11 +5372,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm11 ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm13 @@ -5420,24 +5420,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = 
[0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm14, %xmm7 @@ -5471,24 +5471,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm14, %xmm7 @@ -5522,24 +5522,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw 
{{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -5573,24 +5573,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; 
AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -5623,24 +5623,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -5674,24 +5674,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -5724,24 +5724,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -5806,7 +5806,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, 
%ymm14 @@ -5819,7 +5819,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm4 @@ -5884,7 +5884,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload @@ -5896,7 +5896,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload @@ -5978,14 +5978,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7 ; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm2 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm8 ; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm6 @@ -6017,13 +6017,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm10 ; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm19 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm14, %xmm9, %xmm12 ; AVX512-NEXT: vmovdqa64 %xmm9, %xmm22 ; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm13 @@ -6062,13 +6062,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm2 ; AVX512-NEXT: vmovdqa %xmm7, %xmm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm7 @@ -6106,12 +6106,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22 @@ -6148,13 +6148,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm11 ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX512-NEXT: vmovdqa %xmm9, %xmm7 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm14 @@ -6191,12 +6191,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 
%ymm0, %ymm21 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm1 ; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm2 ; AVX512-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm4 @@ -6235,12 +6235,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm10, %xmm11, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm11, %xmm23 ; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm11 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm5 @@ -6276,12 +6276,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm2 ; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -6329,21 +6329,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride8_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 
192(%rdi), %ymm17 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0 @@ -6375,25 +6375,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6 ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm28 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6 ; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] @@ -6401,30 +6401,30 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm6[0,1],xmm5[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7 @@ -6436,7 +6436,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0 @@ -6517,14 +6517,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = 
[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm2 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm6 @@ -6556,13 +6556,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm10 ; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm19 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm9, %xmm12 ; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm22 ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm13 @@ -6601,13 +6601,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm7 @@ -6645,12 +6645,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm2, 
%xmm7, %xmm14 ; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm22 @@ -6687,13 +6687,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm7 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm14 @@ -6730,12 +6730,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm11, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm4 @@ -6774,12 +6774,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm23 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm5 @@ -6815,12 +6815,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb 
%xmm1, %xmm13, %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -6868,21 +6868,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0 @@ -6914,25 +6914,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] @@ -6940,30 +6940,30 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7 @@ -6975,7 +6975,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0 @@ -7059,14 +7059,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm6 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm9 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm9, %xmm8 @@ -7098,23 +7098,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm12 ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; 
AVX512BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm17 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7133,23 +7133,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -7168,23 +7168,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = 
[0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm15 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -7203,23 +7203,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm17 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7238,23 +7238,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = 
[5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm17 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7273,23 +7273,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm10 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm17 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; 
AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7307,23 +7307,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm7 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm12, %xmm7 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm19, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -7355,7 +7355,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] @@ -7385,15 +7385,15 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] ; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 @@ -7401,47 +7401,47 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] ; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] ; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 @@ -7453,7 +7453,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] ; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 ; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] @@ -7529,14 +7529,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm6 ; 
AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm9 ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm9, %xmm8 @@ -7568,23 +7568,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm12 ; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm17 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7603,23 +7603,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb 
%xmm10, %xmm9, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -7638,23 +7638,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm15 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -7673,23 +7673,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = 
[4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm17 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7708,23 +7708,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm10 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] 
+; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm17 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7743,23 +7743,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm14 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm15 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm17 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -7777,23 +7777,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, 
%ymm7 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm7 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm12, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm19, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -7825,7 +7825,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] @@ -7855,15 +7855,15 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm31 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 @@ -7871,47 +7871,47 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, 
%ymm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 @@ -7923,7 +7923,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] @@ -10755,14 +10755,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: subq $840, %rsp # imm = 0x348 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm13 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm1 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -10772,14 +10772,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd 
{{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm11 ; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm12 @@ -10893,23 +10893,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -10994,25 +10994,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; 
AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -11095,25 +11095,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -11198,24 +11198,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm14 @@ -11300,26 +11300,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -11401,26 +11401,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -11506,24 +11506,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm15 @@ -11647,14 +11647,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: subq $840, %rsp # imm = 0x348 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm13 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -11664,14 +11664,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm11 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm12 @@ -11785,23 +11785,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = 
[0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -11886,25 +11886,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; 
AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -11987,25 +11987,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -12090,24 +12090,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm14 @@ -12192,26 +12192,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -12293,26 +12293,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq 
{{.*#+}} xmm2 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -12398,24 +12398,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm10, %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm15 @@ -12545,7 +12545,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm10 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm0 @@ -12568,14 +12568,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 @@ -12700,13 +12700,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14 ; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,2,6,10,14,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 ; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] @@ -12752,15 +12752,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm12 ; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] @@ -12790,7 +12790,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload @@ -12810,21 +12810,21 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1,2],xmm7[3] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm11 ; 
AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm12 @@ -13046,14 +13046,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, %xmm2 ; AVX512-NEXT: vmovdqa 496(%rdi), %xmm7 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm1 ; AVX512-NEXT: vmovdqa 480(%rdi), %xmm8 ; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm4 ; AVX512-NEXT: vmovdqa 448(%rdi), %xmm15 ; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm5 @@ -13133,13 +13133,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm7, %xmm24 ; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm2 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm11, %xmm27 ; AVX512-NEXT: vpshufb %xmm8, %xmm15, %xmm4 @@ -13230,7 +13230,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm13 ; AVX512-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm1 @@ -13238,7 +13238,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm8 @@ -13325,11 +13325,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; 
AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm10 ; AVX512-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3 @@ -13413,13 +13413,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX512-NEXT: vmovdqa64 %xmm18, %xmm10 ; AVX512-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13503,13 +13503,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm4 @@ -13595,14 +13595,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm2, 
%xmm29 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -13686,13 +13686,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -13793,26 +13793,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $232, %rsp ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7 @@ -13823,13 +13823,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpmovqd %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpmovqd %ymm3, %xmm12 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] @@ -13871,27 +13871,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 ; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm22 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] @@ -13924,24 +13924,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14 @@ -13968,25 +13968,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: 
vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] ; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8 @@ -14010,30 +14010,30 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 
= [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 @@ -14064,27 +14064,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, 
%xmm15 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] ; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 @@ -14114,26 +14114,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 @@ -14160,23 +14160,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, 
%ymm5, %ymm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] ; AVX512-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 @@ -14223,14 +14223,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm2 ; AVX512DQ-NEXT: vmovdqa 496(%rdi), %xmm7 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm1 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm4 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm15 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm5 @@ -14310,13 +14310,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm9 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm24 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm8, %xmm2 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm8 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm27 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm15, %xmm4 @@ -14407,7 +14407,7 @@ define void 
@load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm13 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm1 @@ -14415,7 +14415,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm8 @@ -14502,11 +14502,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm10 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3 @@ -14590,13 +14590,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm10 ; AVX512DQ-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14680,13 +14680,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 
$0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm4 @@ -14772,14 +14772,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -14863,13 +14863,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -14970,26 +14970,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $232, %rsp ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7 @@ -15000,13 +15000,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpmovqd %ymm2, %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpmovqd %ymm3, %xmm12 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] @@ -15048,27 +15048,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm22 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] @@ -15101,24 +15101,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-FCP-NEXT: 
vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14 @@ -15145,25 +15145,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] ; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, 
%zmm8 @@ -15187,30 +15187,30 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 @@ -15241,27 +15241,27 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm5 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 @@ -15291,26 +15291,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 @@ -15337,23 +15337,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: 
vpshufb %xmm10, %xmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] ; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 @@ -15403,14 +15403,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa64 496(%rdi), %xmm24 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3 ; AVX512BW-NEXT: vmovdqa64 480(%rdi), %xmm25 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vmovdqa64 464(%rdi), %xmm26 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm19 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm30 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6 @@ -15489,14 +15489,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 400(%rdi), %xmm21 ; AVX512BW-NEXT: vmovdqa64 416(%rdi), %xmm23 ; AVX512BW-NEXT: vmovdqa64 432(%rdi), %xmm29 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %xmm24, %xmm19 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm24, %xmm1 ; AVX512BW-NEXT: vmovdqa64 %xmm25, %xmm11 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512BW-NEXT: vpshufb %xmm13, %xmm26, %xmm24 ; AVX512BW-NEXT: vmovdqa64 %xmm30, %xmm16 @@ -15505,12 +15505,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm26 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 ; AVX512BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm30 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24 ; AVX512BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] @@ -15574,12 +15574,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 
$0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm19, %xmm2 ; AVX512BW-NEXT: vmovdqa64 %xmm19, %xmm20 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm12, %xmm25 ; AVX512BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -15588,14 +15588,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %xmm29, %xmm24 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm29, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 ; AVX512BW-NEXT: vmovdqa64 %xmm23, %xmm16 ; AVX512BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm30 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %xmm21, %xmm18 ; AVX512BW-NEXT: vpshufb %xmm30, %xmm21, %xmm0 ; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm17 @@ -15664,12 +15664,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm25, %xmm3 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 @@ -15677,12 +15677,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm24, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm24, (%rsp) 
# 16-byte Spill ; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm19 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm19 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm17, %xmm13 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3] @@ -15744,13 +15744,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 ; AVX512BW-NEXT: vmovdqa64 %xmm11, %xmm28 ; AVX512BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload @@ -15759,12 +15759,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm24, %xmm13 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm4, %xmm26, %xmm19 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm19 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 ; AVX512BW-NEXT: vmovdqa64 %xmm18, %xmm24 ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload @@ -15828,24 +15828,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} 
xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm13 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 ; AVX512BW-NEXT: vmovdqa64 %xmm24, %xmm26 ; AVX512BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -15905,25 +15905,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm13 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 ; AVX512BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] @@ -15980,23 +15980,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm8 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -16074,23 +16074,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 ; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 ; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 ; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm0, 
%ymm20 ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 ; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 ; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 @@ -16098,11 +16098,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 ; AVX512BW-FCP-NEXT: vpmovqd %ymm24, %xmm18 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 ; AVX512BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX512BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 @@ -16141,20 +16141,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 ; AVX512BW-FCP-NEXT: vpblendd 
{{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 @@ -16178,20 +16178,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 @@ -16215,20 +16215,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; 
AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] ; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 @@ -16252,28 +16252,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 ; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] ; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 ; AVX512BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 -; AVX512BW-FCP-NEXT: 
vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] ; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 @@ -16302,20 +16302,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 @@ -16338,20 +16338,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 @@ -16374,12 +16374,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] @@ -16436,14 +16436,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 496(%rdi), %xmm24 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 
480(%rdi), %xmm25 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa64 464(%rdi), %xmm26 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm19 = [0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm30 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6 @@ -16522,14 +16522,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 400(%rdi), %xmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 416(%rdi), %xmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 432(%rdi), %xmm29 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [0,0,0,0,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, %xmm19 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm24, %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm25, %xmm11 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm13 = [0,0,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm26, %xmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm30, %xmm16 @@ -16538,12 +16538,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm26 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 ; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm30 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24 ; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] @@ -16607,12 +16607,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,2,10,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm19, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm19, %xmm20 ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,2,10,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm12, %xmm25 ; AVX512DQ-BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -16621,14 +16621,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm29, %xmm24 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm29, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm30 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm21, %xmm18 ; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm21, %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm17 @@ -16697,12 +16697,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm25, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 @@ -16710,12 +16710,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm24, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm19 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} 
xmm19 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm17, %xmm13 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3] @@ -16777,13 +16777,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,4,12,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm11, %xmm28 ; AVX512DQ-BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,4,12,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload @@ -16792,12 +16792,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm24, %xmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm26, %xmm19 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm19 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, %xmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload @@ -16861,24 +16861,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm12, 
%xmm3 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm13 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, %xmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -16938,25 +16938,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm13 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 ; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] @@ -17013,23 +17013,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm8 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -17107,23 +17107,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 @@ -17131,11 +17131,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm24, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 ; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 @@ -17174,20 +17174,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 @@ -17211,20 +17211,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 @@ -17248,20 +17248,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 @@ -17285,28 +17285,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 @@ -17335,20 +17335,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 @@ -17371,20 +17371,20 @@ 
define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 @@ -17407,12 +17407,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 
{{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll index a034363895c0e..5d4c9e127727d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -322,7 +322,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -332,7 +332,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -342,7 +342,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -507,7 +507,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -517,7 +517,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -527,7 +527,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -537,7 +537,7 @@ define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -790,9 +790,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -803,9 +803,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -816,9 +816,9 @@ define void 
@store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -829,9 +829,9 @@ define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -1291,10 +1291,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1311,10 +1311,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, 
%zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1331,10 +1331,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 @@ -1351,10 +1351,10 @@ define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 7dbff047e4f87..4c52853b60384 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -115,7 +115,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512BW-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512BW-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512BW-NEXT: vmovq %xmm1, (%rcx) @@ -125,7 +125,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512BW-FCP-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx) @@ -135,7 +135,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512DQ-BW-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) @@ -145,7 +145,7 @@ define void @store_i16_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,8,1,3,9,1,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2w (%rdx), %xmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm1, 8(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx) @@ -330,7 +330,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx) @@ -344,7 +344,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -358,7 +358,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx) @@ -372,7 +372,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,4,16,1,5,17,2,6,18,3,7,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -466,9 +466,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -490,9 +490,9 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-FP-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -509,12 +509,12 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] @@ -536,7 +536,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX512-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -554,10 +554,10 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -581,7 +581,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX512DQ-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -599,10 +599,10 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -620,7 +620,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx) @@ -632,7 +632,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) @@ -644,7 +644,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rcx) @@ -656,7 +656,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rcx) @@ -811,29 +811,29 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] ; AVX2-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] ; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm2, (%rcx) @@ -847,37 +847,37 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,1,0,2] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] ; AVX2-FP-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rcx) @@ -891,37 +891,37 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,1,0,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,5,0,6,6,0,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] ; AVX2-FCP-NEXT: vpermd (%rdi), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rcx) @@ -948,7 +948,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] ; AVX512-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -956,11 +956,11 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512-NEXT: vprold $16, %xmm4, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-NEXT: 
vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] ; AVX512-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -987,7 +987,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -995,11 +995,11 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1026,7 +1026,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -1034,11 +1034,11 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1065,7 +1065,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21] ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero @@ -1073,11 +1073,11 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,5,0,6,6,0,7,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rcx) @@ -1090,9 +1090,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; 
AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1104,9 +1104,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1118,9 +1118,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1132,9 +1132,9 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [42,11,27,43,12,28,44,13,29,45,14,30,46,15,31,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,1,17,33,2,18,34,3,19,35,4,20,36,5,21,37,6,22,38,7,23,39,8,24,40,9,25,41,10,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -1415,81 +1415,81 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i16_stride3_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; 
AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,5,0,6,6,0,7,7] -; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 -; AVX2-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX2-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7] +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3,4],xmm11[5],xmm8[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,3,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-NEXT: vpblendw 
{{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] -; AVX2-NEXT: vpermd %ymm2, %ymm7, %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 -; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] +; AVX2-NEXT: vpermd %ymm3, %ymm7, %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 -; AVX2-NEXT: vpermd %ymm1, %ymm7, %ymm7 -; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,3,0,4,4,0,5] -; AVX2-NEXT: vpermd (%rdi), %ymm9, %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,3,3,0,4,4,0] -; AVX2-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpermd 32(%rdi), %ymm9, %ymm4 -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX2-NEXT: vpermd %ymm0, %ymm7, %ymm7 +; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] +; AVX2-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm2 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,3,3,u,4,4,u] +; AVX2-NEXT: vpermd 
%ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpermd 32(%rdi), %ymm8, %ymm3 +; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 128(%rcx) ; AVX2-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm7, 96(%rcx) +; AVX2-NEXT: vmovdqa %ymm5, 96(%rcx) ; AVX2-NEXT: vmovdqa %ymm6, (%rcx) -; AVX2-NEXT: vmovdqa %ymm5, 64(%rcx) -; AVX2-NEXT: vmovdqa %ymm3, 160(%rcx) +; AVX2-NEXT: vmovdqa %ymm4, 64(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, 160(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1498,7 +1498,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1508,9 +1508,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm9 @@ -1530,11 +1530,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] ; AVX2-FP-NEXT: vpermd %ymm3, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 @@ -1546,18 +1546,18 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm8 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FP-NEXT: 
vpmovsxbd {{.*#+}} ymm8 = [0,3,3,0,4,4,0,5] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] ; AVX2-FP-NEXT: vpermd (%rdi), %ymm8, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FP-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,3,3,0,4,4,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 @@ -1575,7 +1575,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1585,9 +1585,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,1,0,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm9 @@ -1607,11 +1607,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; 
AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 @@ -1623,18 +1623,18 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,3,0,4,4,0,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] ; AVX2-FCP-NEXT: vpermd (%rdi), %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FCP-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,3,3,0,4,4,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 @@ -1670,52 +1670,52 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm9 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; 
AVX512-NEXT: vprold $16, %xmm11, %xmm11 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[0,1,2,3] ; AVX512-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX512-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512-NEXT: vpandn %ymm4, %ymm12, %ymm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & mem) +; AVX512-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512-NEXT: vprold $16, %xmm0, %xmm8 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] -; AVX512-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,0,u,1,1,u,2] +; AVX512-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm3)) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1745,52 +1745,52 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm11 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX512-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpandn %ymm4, %ymm12, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & mem) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm8 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; 
AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,0,u,1,1,u,2] +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm3)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1820,52 +1820,52 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512DQ-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512DQ-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm9 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vprold $16, %xmm11, %xmm11 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[0,1,2,3] ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX512DQ-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-NEXT: vpandn %ymm4, %ymm12, %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & mem) +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512DQ-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm8 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] -; AVX512DQ-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,0,u,1,1,u,2] +; AVX512DQ-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm3)) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1895,52 +1895,52 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = 
xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm12, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & mem) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,0,u,1,1,u,2] +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1952,17 +1952,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 
(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1975,17 +1975,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, 
%zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1998,17 +1998,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -2021,17 +2021,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -2632,9 +2632,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-LABEL: store_i16_stride3_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm1 ; AVX2-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-NEXT: vmovdqa 96(%rdx), %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] @@ -2645,9 +2645,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,5,0,6,6,0,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [5,5,u,6,6,u,7,7] ; AVX2-NEXT: vpermd %ymm3, %ymm8, %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 @@ -2659,7 +2659,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX2-NEXT: vpermd %ymm0, %ymm8, %ymm6 +; AVX2-NEXT: vpermd 
%ymm1, %ymm8, %ymm6 ; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm6 @@ -2671,7 +2671,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-NEXT: vpermd %ymm1, %ymm8, %ymm10 +; AVX2-NEXT: vpermd %ymm0, %ymm8, %ymm10 ; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rdi), %xmm10 @@ -2695,9 +2695,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,1,0,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,0,0,u,1,1,u,2] ; AVX2-NEXT: vpermd %ymm4, %ymm12, %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm9 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] @@ -2707,19 +2707,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX2-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 ; AVX2-NEXT: vpermd %ymm3, %ymm12, %ymm10 ; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] ; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX2-NEXT: vpermd %ymm0, %ymm12, %ymm10 +; AVX2-NEXT: vpermd %ymm1, %ymm12, %ymm10 ; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm10 ; AVX2-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7] @@ -2731,38 +2731,38 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa (%rsi), %ymm14 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpermd %ymm1, %ymm12, %ymm5 +; AVX2-NEXT: vpermd %ymm0, %ymm12, %ymm5 ; AVX2-NEXT: vpblendvb %ymm13, %ymm2, 
%ymm5, %ymm11 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm5 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,3,3,0,4,4,0,5] -; AVX2-NEXT: vpermd (%rdi), %ymm12, %ymm13 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,3,3,u,4,4,u,5] +; AVX2-NEXT: vpermd (%rdi), %ymm5, %ymm12 +; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm13 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm12, %ymm12 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm13 ; AVX2-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX2-NEXT: vpermd 64(%rdi), %ymm12, %ymm15 +; AVX2-NEXT: vpermd 64(%rdi), %ymm5, %ymm15 ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm15 ; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm15 -; AVX2-NEXT: vpermd 32(%rdi), %ymm12, %ymm6 +; AVX2-NEXT: vpermd 32(%rdi), %ymm5, %ymm6 ; AVX2-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6 +; AVX2-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 ; AVX2-NEXT: vmovdqa 96(%rsi), %ymm15 ; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 -; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,3,3,0,4,4,0] -; AVX2-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpermd %ymm3, %ymm12, %ymm3 +; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [2,u,3,3,u,4,4,u] +; AVX2-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 -; AVX2-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 320(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 128(%rcx) +; AVX2-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, 128(%rcx) ; AVX2-NEXT: vmovdqa %ymm3, 224(%rcx) ; AVX2-NEXT: vmovdqa %ymm4, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm11, 288(%rcx) @@ -2781,139 +2781,139 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FP-LABEL: store_i16_stride3_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} 
xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm3 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,1,0,2] -; AVX2-FP-NEXT: vpermd %ymm2, %ymm13, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm10, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] -; AVX2-FP-NEXT: 
vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm13, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 -; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 +; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm6, %ymm6 ; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm5 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm15 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm15, %ymm6 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm13, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm13, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm2 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FP-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm13, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm13, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm12 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FP-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm10, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm13 ; AVX2-FP-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm10, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm7 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] -; AVX2-FP-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,3,3,u,4,4,u,5] +; AVX2-FP-NEXT: vpermd (%rdi), %ymm3, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm2 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FP-NEXT: vpermd 64(%rdi), %ymm3, %ymm11 ; AVX2-FP-NEXT: 
vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FP-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FP-NEXT: vpermd 32(%rdi), %ymm3, %ymm14 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FP-NEXT: vpermd 96(%rdi), %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm14 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FP-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] -; AVX2-FP-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,3,3,u,4,4,u] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm11, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 320(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm4, 128(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm7, 64(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm12, 288(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm13, 352(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%rcx) -; AVX2-FP-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2925,139 +2925,139 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FCP-LABEL: store_i16_stride3_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FCP-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm4[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,1,0,2] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [5,5,0,6,6,0,7,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm15 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm6, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm13, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm13, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm2 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FCP-NEXT: vpermd 
%ymm15, %ymm13, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm13, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm13 ; AVX2-FCP-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,3,0,4,4,0,5] -; AVX2-FCP-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,3,3,u,4,4,u,5] +; AVX2-FCP-NEXT: vpermd (%rdi), %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FCP-NEXT: vpermd 64(%rdi), %ymm3, %ymm11 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-FCP-NEXT: 
vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpermd 32(%rdi), %ymm3, %ymm14 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FCP-NEXT: vpermd 96(%rdi), %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm14 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,0,3,3,0,4,4,0] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,3,3,u,4,4,u] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm11, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 320(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 128(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 64(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm12, 288(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm13, 352(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 96(%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 160(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3095,7 +3095,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] ; AVX512-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3108,19 +3108,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-NEXT: 
vpshufb %xmm1, %xmm10, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm13 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[0,1,2,3] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm0 @@ -3199,10 +3199,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermd (%rdx), %zmm18, %zmm2 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm0)) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm17, 320(%rcx) @@ -3239,7 +3239,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3252,19 +3252,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm13 +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 @@ -3343,10 +3343,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm2 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx) @@ -3383,7 +3383,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] ; AVX512DQ-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3396,19 +3396,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm13 +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512DQ-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512DQ-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm0 @@ -3487,10 +3487,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermd (%rdx), %zmm18, %zmm2 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 320(%rcx) @@ -3527,7 +3527,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -3540,19 +3540,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512DQ-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 @@ -3631,10 +3631,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx) @@ -3651,20 +3651,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3689,20 +3689,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3727,20 +3727,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 @@ -3765,20 +3765,20 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,1,33,0,2,34,0,3,35,0,4,36,0,5,37,0,6,38,0,7,39,0,8,40,0,9,41,0,10,42] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [21,0,54,22,0,55,23,0,56,24,0,57,25,0,58,26,0,59,27,0,60,28,0,61,29,0,62,30,0,63,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,11,43,0,12,44,0,13,45,0,14,46,0,15,47,0,16,48,0,17,49,0,18,50,0,19,51,0,20,52,0,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index fc4377a08d560..b6e55a242c3b8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -19,8 +19,8 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride4_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -134,7 +134,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8) ; AVX512BW-FCP-NEXT: retq @@ -156,7 +156,7 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: retq @@ -254,7 +254,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -287,7 +287,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -320,7 +320,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r8) @@ -335,7 +335,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa %ymm2, 
(%r8) ; AVX512BW-NEXT: vzeroupper @@ -349,7 +349,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -363,7 +363,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -377,7 +377,7 @@ define void @store_i16_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,16,20,1,5,17,21,2,6,18,22,3,7,19,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -450,7 +450,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -473,7 +473,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -496,7 +496,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = 
[151519488,185205506,218891524,252577542] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -516,10 +516,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -539,10 +539,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -562,10 +562,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -585,10 +585,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: 
vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -608,7 +608,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -621,7 +621,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -634,7 +634,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -647,7 +647,7 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -883,7 +883,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm10 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[3],ymm4[4],ymm9[5],ymm4[6],ymm9[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -926,7 +926,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2d %ymm4, %ymm10, %ymm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -956,7 +956,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -986,7 +986,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2d %ymm4, %ymm10, %ymm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -1016,7 +1016,7 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm9 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -1040,9 +1040,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1055,9 +1055,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1070,9 +1070,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1085,9 +1085,9 @@ define void @store_i16_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,32,48,1,17,33,49,2,18,34,50,3,19,35,51,4,20,36,52,5,21,37,53,6,22,38,54,7,23,39,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,40,56,9,25,41,57,10,26,42,58,11,27,43,59,12,28,44,60,13,29,45,61,14,30,46,62,15,31,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1516,7 +1516,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4],ymm12[5],ymm2[6],ymm12[7] ; AVX2-FCP-NEXT: vmovdqa 48(%rcx), %xmm12 @@ -1614,7 +1614,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 48(%rdx), %xmm9 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512-NEXT: vpermt2d %zmm15, %zmm5, %zmm13 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-NEXT: kmovw %eax, %k1 @@ -1694,7 +1694,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm9 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm5, %zmm13 ; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -1774,7 +1774,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm9 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm5, %zmm13 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -1854,7 +1854,7 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -1911,26 +1911,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 
(%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1946,26 +1946,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-FCP-NEXT: 
vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1981,26 +1981,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = 
[8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2016,26 +2016,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2905,7 +2905,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX2-FCP-NEXT: vmovdqa 16(%rdx), %xmm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill @@ -3095,7 +3095,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512-NEXT: vpermt2d %zmm15, %zmm5, %zmm8 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-NEXT: kmovw %eax, %k1 @@ -3249,7 +3249,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm5, %zmm8 ; AVX512-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -3403,7 +3403,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm5, %zmm8 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 @@ -3557,7 +3557,7 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -3692,32 +3692,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: 
vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3753,32 +3753,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-FCP-NEXT: 
vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3814,32 +3814,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3875,32 +3875,32 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,9,41,0,0,10,42,0,0,11,43,0,0,12,44,0,0,13,45,0,0,14,46,0,0,15,47,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,25,57,0,0,26,58,0,0,27,59,0,0,28,60,0,0,29,61,0,0,30,62,0,0,31,63,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [16,48,0,0,17,49,0,0,18,50,0,0,19,51,0,0,20,52,0,0,21,53,0,0,22,54,0,0,23,55,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, 
%zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 418c987ab9a30..b22c25a515163 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -45,11 +45,11 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 @@ -71,7 +71,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,65535,65535,65535,0,65535,0,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovd %xmm1, 16(%r9) @@ -82,8 +82,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-LABEL: store_i16_stride5_vf2: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -100,8 +100,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-LABEL: store_i16_stride5_vf2: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -118,8 +118,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: store_i16_stride5_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -136,8 +136,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-LABEL: store_i16_stride5_vf2: ; AVX512-FCP: # %bb.0: 
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -154,8 +154,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-LABEL: store_i16_stride5_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -172,8 +172,8 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -195,7 +195,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovd %xmm1, 16(%r9) @@ -211,7 +211,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovd %xmm1, 16(%r9) @@ -227,7 +227,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovd %xmm1, 16(%r9) @@ -243,7 +243,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, 16(%r9) @@ -361,7 +361,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] @@ -389,7 +389,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] @@ -416,7 +416,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] @@ -544,7 +544,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512BW-NEXT: vmovq 
%xmm1, 32(%r9) @@ -563,7 +563,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm1, 32(%r9) @@ -582,7 +582,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm1, 32(%r9) @@ -601,7 +601,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 32(%r9) @@ -787,19 +787,19 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] +; AVX2-NEXT: 
vpbroadcastq (%r8), %ymm7 ; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] @@ -833,23 +833,23 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] +; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm7 ; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] +; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] @@ -868,29 +868,29 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = 
[1,5,2,6,2,6,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,5,2,6,2,6,u,u] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29] ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] +; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm7 ; AVX2-FCP-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9) @@ -937,8 +937,8 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-NEXT: vpbroadcastd 12(%r8), %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) ; 
AVX512-NEXT: vmovdqa64 %zmm4, (%r9) @@ -947,22 +947,22 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i16_stride5_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] -; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,9,2,10,2,10,u,u] +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 @@ -971,12 +971,12 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) @@ -1021,8 +1021,8 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpbroadcastd 12(%r8), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) @@ -1031,22 +1031,22 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,9,2,10,2,10,u,u] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 @@ -1055,12 +1055,12 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) @@ -1071,13 +1071,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1088,13 +1088,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1105,13 +1105,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1122,13 +1122,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [22,30,38,7,15,23,31,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,1,9,17,25,33,2,10,18,26,34,3,11,19,27,35,4,12,20,28,36,5,13,21,29,37,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -1473,40 +1473,40 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-LABEL: store_i16_stride5_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] +; AVX2-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpbroadcastq (%r8), %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3],ymm6[4],ymm10[5,6],ymm6[7],ymm10[8,9],ymm6[10],ymm10[11],ymm6[12],ymm10[13,14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 ; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm10 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] @@ -1515,262 +1515,262 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = 
ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,1,2,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-NEXT: vmovdqa %ymm7, 32(%r9) -; AVX2-NEXT: vmovdqa %ymm6, 96(%r9) -; AVX2-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-NEXT: vmovdqa %ymm5, 96(%r9) +; AVX2-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i16_stride5_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 -; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 -; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} 
xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,2,5,5,5,6] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8,9],ymm9[10],ymm8[11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3,4],ymm9[5,6,7,8],ymm10[9],ymm9[10],ymm10[11,12],ymm9[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10],ymm6[11],ymm2[12,13],ymm6[14],ymm2[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm1 = ymm4[1,1,2,2] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i16_stride5_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 -; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 -; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,2,5,5,5,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8,9],ymm9[10],ymm8[11],ymm9[12],ymm8[13,14],ymm9[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3,4],ymm9[5,6,7,8],ymm10[9],ymm9[10],ymm10[11,12],ymm9[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10],ymm6[11],ymm2[12,13],ymm6[14],ymm2[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[1,1,2,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i16_stride5_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,2,2,2] +; AVX512-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm3)) ; AVX512-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; 
AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm8 & (ymm5 ^ ymm7)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[0,1,2,3] ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm9 & (ymm8 ^ ymm5)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm7 & (ymm10 ^ ymm5)) -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm10[0,1,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 
= ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpbroadcastq 16(%r8), %ymm7 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) +; AVX512-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[0,1,2,3] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & mem) ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1793,61 +1793,61 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-LABEL: store_i16_stride5_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,2,2,2] +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm3)) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm8 & (ymm5 ^ ymm7)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[0,1,2,3] ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm9 & (ymm8 ^ ymm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm7 & (ymm10 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm10[0,1,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpbroadcastq 16(%r8), %ymm7 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[0,1,2,3] +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] @@ -1866,60 +1866,60 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-LABEL: store_i16_stride5_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,2,2,2] +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm3)) ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; 
AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm8 & (ymm5 ^ ymm7)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[0,1,2,3] ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm9 & (ymm8 ^ 
ymm5)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm7 & (ymm10 ^ ymm5)) -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpbroadcastq 16(%r8), %ymm7 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) +; AVX512DQ-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & mem) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1942,61 +1942,61 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 +; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm8 & (ymm5 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[0,1,2,3] ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm9 & (ymm8 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm9 & (ymm7 ^ ymm5)) ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm7 & (ymm10 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm8 & (ymm10 ^ ymm5)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpbroadcastq 16(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] @@ -2016,22 +2016,22 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] +; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512BW-NEXT: vzeroupper @@ -2041,22 +2041,22 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512BW-FCP-NEXT: vzeroupper @@ -2066,22 +2066,22 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2091,22 +2091,22 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,13,29,45,61,0,14,30,46,62,0,15,31,47,63,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,16,32,48,0,1,17,33,49,0,2,18,34,50,0,3,19,35,51,0,4,20,36,52,0,5,21,37,53,0,6,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [6,22,0,39,55,7,23,0,40,56,8,24,0,41,57,9,25,0,42,58,10,26,0,43,59,11,27,0,44,60,12,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -2821,66 +2821,69 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i16_stride5_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: subq $104, %rsp +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%r8), %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm4 +; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm12 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX2-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-NEXT: vmovdqa (%rsi), %ymm4 ; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] @@ -2889,8 +2892,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] @@ -2900,57 +2903,57 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,1,1] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 
+; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX2-NEXT: vmovdqa (%rcx), %ymm13 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3,4],ymm0[5,6,7,8],ymm4[9],ymm0[10],ymm4[11,12],ymm0[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 +; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm4, %ymm12 ; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX2-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,1,2,5,5,5,6] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3],ymm6[4],ymm4[5,6],ymm6[7],ymm4[8,9],ymm6[10],ymm4[11],ymm6[12],ymm4[13,14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-NEXT: 
vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 ; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm1 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 @@ -2959,21 +2962,21 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] +; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm15[3,0,3,0,7,4,7,4] ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] ; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,1,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,2,2] @@ -2986,90 +2989,92 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm1, 192(%r9) ; AVX2-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: addq $104, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i16_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $40, %rsp -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FP-NEXT: subq $72, %rsp +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = 
xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm7, %xmm10 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,2,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm15, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm0 +; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0],xmm13[1],xmm5[2],xmm13[3],xmm5[4,5],xmm13[6],xmm5[7] +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm1, 
%ymm0, %ymm7, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm7 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm6, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] @@ -3078,11 +3083,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; 
AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 @@ -3093,35 +3099,35 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 ; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm5 ; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 @@ -3130,23 +3136,23 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FP-NEXT: 
vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%r9) @@ -3155,91 +3161,94 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FP-NEXT: addq $40, %rsp +; AVX2-FP-NEXT: addq $72, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i16_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $40, %rsp -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FCP-NEXT: subq $72, %rsp +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FCP-NEXT: vpshufb %xmm13, 
%xmm7, %xmm10 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,2,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm9 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm15, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm0 +; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0],xmm13[1],xmm5[2],xmm13[3],xmm5[4,5],xmm13[6],xmm5[7] +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm7 = ymm7[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm6, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm10 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] @@ -3248,11 +3257,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm14 ; 
AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 @@ -3263,35 +3273,35 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 ; 
AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 @@ -3300,23 +3310,23 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%r9) @@ -3325,14 +3335,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FCP-NEXT: addq $40, %rsp +; AVX2-FCP-NEXT: addq $72, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3346,56 +3357,56 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23 ; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm2[0,1,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7,8],ymm2[9],ymm12[10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm17 & (zmm12 ^ zmm0)) -; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) +; 
AVX512-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm17 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm1)) +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm14 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm0 +; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm16 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm16[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm15 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3],xmm15[4],xmm7[5],xmm15[6],xmm7[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm7 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10],ymm7[11],ymm13[12,13],ymm7[14],ymm13[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm7, %xmm11, %xmm11 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] @@ -3409,82 +3420,81 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] ; AVX512-NEXT: vpandnq %ymm11, %ymm20, %ymm11 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & zmm20) -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0)) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm14, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm8 -; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm5[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & zmm20) +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm14, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm7 +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3,4],ymm9[5,6,7,8],ymm5[9],ymm9[10],ymm5[11,12],ymm9[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm2 & (zmm5 ^ zmm1)) -; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3,4],ymm9[5,6,7,8],ymm6[9],ymm9[10],ymm6[11,12],ymm9[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm2 & (zmm6 ^ zmm0)) +; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm6, %ymm8 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm6)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm5, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512-NEXT: vpshufb 
%ymm7, %ymm5, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[2,3,2,3] -; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[2,3,2,3] +; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,0,3,0,7,4,7,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm17 & (zmm3 ^ zmm4)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm17 & (zmm1 ^ zmm4)) ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm2 ; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm11, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm19, 128(%r9) ; AVX512-NEXT: vzeroupper @@ -3492,11 +3502,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-FCP-LABEL: store_i16_stride5_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[1,1,2,2] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm11[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 @@ -3508,11 +3518,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 ; 
AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm8 ; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %ymm16 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] @@ -3521,12 +3530,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7 ; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,2] +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[1,2,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm2)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3535,23 +3545,22 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] ; AVX512-FCP-NEXT: vpandnq %ymm3, %ymm17, %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm12 & zmm17) -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm13 & zmm17) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512-FCP-NEXT: vprolq $16, %ymm15, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] @@ -3562,82 +3571,81 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm1 & mem) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vprolq $16, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,3,2,3,10,11,10,10] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3,4],ymm5[5,6,7,8],ymm13[9],ymm5[10],ymm13[11,12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3,4],ymm5[5,6,7,8],ymm11[9],ymm5[10],ymm11[11,12],ymm5[13,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] -; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm14 & (zmm13 ^ zmm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm9 ^ (zmm14 & (zmm11 ^ zmm9)) ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 ; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm13)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm2 & (zmm9 ^ zmm11)) ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,2,3,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,3,2,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm8 ^ (zmm17 & (zmm4 ^ zmm8)) +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,2,3,6,7,6,7] +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm17 & (zmm6 ^ zmm4)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2],xmm9[3],xmm4[4,5],xmm9[6],xmm4[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,1,8,9,8,8] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6],xmm4[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm14 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm2[0,1,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm6 ^ (zmm14 & (zmm0 ^ zmm6)) ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -3653,56 +3661,56 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 
%xmm1, %xmm23 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7,8],ymm2[9],ymm12[10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm17 & (zmm12 ^ zmm0)) -; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) +; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm17 & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm14 ; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm16 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm16[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3],xmm15[4],xmm7[5],xmm15[6],xmm7[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm7 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10],ymm7[11],ymm13[12,13],ymm7[14],ymm13[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm11 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] @@ -3716,82 +3724,81 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] ; AVX512DQ-NEXT: vpandnq %ymm11, %ymm20, %ymm11 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & zmm20) -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0)) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm14, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & 
zmm20) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm14, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8],ymm1[9],ymm7[10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3,4],ymm9[5,6,7,8],ymm5[9],ymm9[10],ymm5[11,12],ymm9[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm2 & (zmm5 ^ zmm1)) -; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3,4],ymm9[5,6,7,8],ymm6[9],ymm9[10],ymm6[11,12],ymm9[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm2 & (zmm6 ^ zmm0)) +; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm6, %ymm8 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm6)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm5, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[2,3,2,3] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[2,3,2,3] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,0,3,0,7,4,7,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm17 & (zmm3 ^ zmm4)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm17 & (zmm1 ^ zmm4)) ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%r9) ; AVX512DQ-NEXT: vzeroupper @@ -3799,11 +3806,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm11[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 @@ -3815,11 +3822,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512DQ-FCP-NEXT: 
vmovdqa 32(%rcx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %ymm16 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] @@ -3828,12 +3834,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[1,2,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3842,23 +3849,22 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] ; AVX512DQ-FCP-NEXT: vpandnq %ymm3, %ymm17, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm12 & zmm17) -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm13 & zmm17) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm15, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm12[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] @@ -3869,82 +3875,81 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,3,2,3,10,11,10,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,3,2,3,10,11,10,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3,4],ymm5[5,6,7,8],ymm13[9],ymm5[10],ymm13[11,12],ymm5[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3,4],ymm5[5,6,7,8],ymm11[9],ymm5[10],ymm11[11,12],ymm5[13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm14 & (zmm13 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm9 ^ (zmm14 & (zmm11 ^ zmm9)) ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm2 & (zmm9 ^ zmm11)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,2,3,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] -; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm4 = [2,3,2,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm8 ^ (zmm17 & (zmm4 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm17 & (zmm6 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2],xmm9[3],xmm4[4,5],xmm9[6],xmm4[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,1,8,9,8,8] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm14 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm6 ^ (zmm14 & (zmm0 ^ zmm6)) ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) 
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -3957,46 +3962,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4013,46 +4018,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = 
[6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4069,46 +4074,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -4125,46 +4130,46 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) @@ -5614,338 +5619,335 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i16_stride5_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX2-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm14 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm15 -; AVX2-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-NEXT: 
vmovdqa 96(%rcx), %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,1] -; AVX2-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpbroadcastq (%r8), %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,1] +; AVX2-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX2-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm5, %ymm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm15 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm13 +; AVX2-NEXT: vmovdqa 96(%rdx), %xmm10 +; AVX2-NEXT: vmovdqa 96(%rcx), %xmm11 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpermq {{.*#+}} 
ymm8 = ymm4[0,1,0,1] +; AVX2-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX2-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpbroadcastq (%r8), %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm14, %ymm15, %ymm5, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 64(%r8), %ymm5 -; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 96(%r8), %ymm5 -; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm4 +; AVX2-NEXT: vpblendvb %ymm14, %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-NEXT: vpshufb %xmm14, %xmm9, %xmm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb %xmm8, %xmm10, %xmm4 -; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX2-NEXT: vpbroadcastq 72(%rdi), %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-NEXT: vmovdqa (%rdx), %ymm10 -; AVX2-NEXT: vmovdqu %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = mem[1,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm5 -; AVX2-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-NEXT: vpshufb %xmm15, %xmm7, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm2 +; AVX2-NEXT: vpbroadcastq 72(%rdi), %xmm3 +; 
AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = mem[1,2,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX2-NEXT: vpbroadcastq 104(%rdi), %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-NEXT: vpshufb %xmm15, %xmm11, %xmm3 +; AVX2-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,2,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-NEXT: vmovdqa (%rsi), %ymm7 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,1,2,5,5,5,6] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vmovdqa (%rdi), %ymm10 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 -; AVX2-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX2-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-NEXT: 
vpblendvb %ymm15, %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm12 +; AVX2-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm4 ; AVX2-NEXT: vmovdqa 96(%rcx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa 96(%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX2-NEXT: vmovdqa 96(%rsi), %ymm14 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3],ymm14[4],ymm0[5,6],ymm14[7],ymm0[8,9],ymm14[10],ymm0[11],ymm14[12],ymm0[13,14],ymm14[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm9 -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm2 +; 
AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 80(%r8), %ymm5 -; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpbroadcastq 48(%r8), %ymm2 +; AVX2-NEXT: vpblendvb %ymm6, %ymm12, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastq 80(%r8), %ymm2 +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 112(%r8), %ymm2 +; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX2-NEXT: vpshufd 
{{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3,4],ymm5[5,6,7,8],ymm9[9],ymm5[10],ymm9[11,12],ymm5[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovdqa %ymm13, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] +; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm11 # 32-byte Reload +; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3,4],ymm6[5,6,7,8],ymm7[9],ymm6[10],ymm7[11,12],ymm6[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,2,6,7,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 -; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2],ymm15[3,4],ymm7[5,6,7,8],ymm15[9],ymm7[10],ymm15[11,12],ymm7[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] +; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpbroadcastq 24(%r8), %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastq 56(%r8), %ymm0 +; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 88(%r8), %ymm0 -; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 120(%r8), %ymm0 -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm0, %ymm12 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] 
-; AVX2-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm9 -; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,0,3,0,7,4,7,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm12 -; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] -; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm9 -; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm10 -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[3,0,3,0,7,4,7,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm8 +; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[3,0,3,0,7,4,7,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm6 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm7 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpshufb %ymm0, %ymm10, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10],ymm6[11],ymm15[12,13],ymm6[14],ymm15[15] +; AVX2-NEXT: vpshufb %ymm4, %ymm13, 
%ymm15 +; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] +; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm13, %ymm6 +; AVX2-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[3,0,3,0,7,4,7,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm10 +; AVX2-NEXT: vpermq $165, (%rsp), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[1,1,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX2-NEXT: vpblendvb %ymm7, %ymm11, %ymm9, %ymm9 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX2-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,1,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[1,1,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[1,1,2,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-NEXT: vpblendvb %ymm3, %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpblendvb %ymm3, %ymm9, %ymm4, %ymm4 ; AVX2-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[1,1,2,2] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 @@ -5953,8 
+5955,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %ymm4, 384(%r9) ; AVX2-NEXT: vmovdqa %ymm2, 224(%r9) ; AVX2-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 608(%r9) +; AVX2-NEXT: vmovdqa %ymm12, 608(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5971,7 +5972,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-NEXT: vmovdqa %ymm13, 128(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5984,69 +5986,70 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: addq $968, %rsp # imm = 0x3C8 +; AVX2-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i16_stride5_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = 
[65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm15 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FP-NEXT: vpshufb %xmm9, %xmm14, %xmm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm14 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, 
%ymm10, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm15, %ymm10, %ymm10 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 64(%r8), %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 @@ -6055,88 +6058,87 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FP-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,2,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpbroadcastq 40(%rdi), %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX2-FP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm2 = mem[1,2,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,1,0,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpbroadcastq 72(%rdi), %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,2,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpbroadcastq 104(%rdi), %xmm1 +; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm2 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FP-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FP-NEXT: vmovdqa 
96(%r8), %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,1] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,1] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,2,5,5,5,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm10, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm4 ; 
AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6153,7 +6155,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm11 ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm2 @@ -6162,7 +6164,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm5 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6172,7 +6174,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 ; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm7 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] @@ -6188,34 +6190,33 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 48(%r8), %ymm13 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 80(%r8), %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FP-NEXT: vpbroadcastq 80(%r8), %ymm11 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 112(%r8), %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpbroadcastq 112(%r8), %ymm11 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm11, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm11, 
%ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm10, %ymm11 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] @@ -6223,13 +6224,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm15 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2],ymm10[3,4],ymm15[5,6,7,8],ymm10[9],ymm15[10],ymm10[11,12],ymm15[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm10, %ymm10 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 -; AVX2-FP-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -6243,7 +6244,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vpshufd 
{{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm12 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm3 @@ -6252,11 +6253,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 ; AVX2-FP-NEXT: vpbroadcastq 24(%r8), %ymm12 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FP-NEXT: vpbroadcastq 56(%r8), %ymm11 +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq 88(%r8), %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 @@ -6265,36 +6266,36 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7,8],ymm10[9],ymm14[10],ymm10[11],ymm14[12,13],ymm10[14],ymm14[15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm10, %ymm13, %ymm10 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,1,2,2] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 ; AVX2-FP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX2-FP-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] @@ -6306,11 +6307,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm10, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm11, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 @@ -6359,62 +6360,63 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-LABEL: store_i16_stride5_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm15 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm14 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm15, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 64(%r8), %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 @@ -6423,88 +6425,87 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FCP-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,2,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[1,2,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,2,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm2 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm2 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,1] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,1] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,2,5,5,5,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6521,7 +6522,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm11 ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 @@ -6530,7 +6531,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6540,7 +6541,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FCP-NEXT: 
vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 ; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm7 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] @@ -6556,34 +6557,33 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 16(%r8), %ymm13 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 48(%r8), %ymm13 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 80(%r8), %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastq 80(%r8), %ymm11 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 112(%r8), %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastq 112(%r8), %ymm11 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm11 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FCP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0] -; 
AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm13 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] @@ -6591,13 +6591,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm15 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2],ymm10[3,4],ymm15[5,6,7,8],ymm10[9],ymm15[10],ymm10[11,12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -6611,7 +6611,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm3 @@ -6620,11 +6620,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastq 24(%r8), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm10 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm2 ; 
AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq 88(%r8), %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 @@ -6633,36 +6633,36 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 ; AVX2-FCP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7,8],ymm10[9],ymm14[10],ymm10[11],ymm14[12,13],ymm10[14],ymm14[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm10, %ymm13, %ymm10 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 ; AVX2-FCP-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = mem[1,1,2,2] ; 
AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] @@ -6674,11 +6674,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm10, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[1,1,2,2] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm11, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 @@ -6732,8 +6732,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm23 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[3,0,3,0,7,4,7,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] @@ -6746,8 +6746,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -6762,10 +6763,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512-NEXT: vpandn %ymm4, %ymm3, %ymm4 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512-NEXT: vpandnq %ymm4, %ymm21, %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm26 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -6785,14 +6785,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[2,3,2,3,6,7,6,7] ; AVX512-NEXT: vmovdqa 64(%rsi), %ymm15 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 ; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm1 @@ -6859,6 +6859,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm27 ; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm31 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm12 ; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm31[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] @@ -6866,7 +6867,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm10 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm3[0,1,1,1] ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm5 @@ -6889,8 +6890,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm3[2,3,2,3] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] @@ -6909,40 +6909,40 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm13 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[1,1,1,2,5,5,5,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13],ymm2[14],ymm13[15] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm15, %ymm14 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] ; AVX512-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm17[1,1,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm16[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10],ymm6[11],ymm14[12,13],ymm6[14],ymm14[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm8, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3],ymm6[4],ymm14[5,6],ymm6[7],ymm14[8,9],ymm6[10],ymm14[11],ymm6[12],ymm14[13,14],ymm6[15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512-NEXT: 
vpshufb %ymm3, %ymm6, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10,11],ymm14[12],ymm2[13],ymm14[14],ymm2[15] +; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm14 +; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm8, %ymm15 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512-NEXT: # ymm14 = mem[0,1,0,0] +; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = mem[0,1,0,0] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,3,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,3,2,3,6,7,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] @@ -6960,13 +6960,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm29 # 64-byte Folded Reload ; AVX512-NEXT: # zmm29 = mem ^ (zmm28 & (zmm29 ^ mem)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm23 # 64-byte Folded Reload -; AVX512-NEXT: # zmm23 = mem ^ (zmm30 & (zmm23 ^ mem)) +; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = mem ^ (zmm30 & (zmm17 ^ mem)) ; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm19 ; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm17)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm24 ^ (zmm30 & (zmm25 ^ zmm24)) ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm17 ; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm23 @@ -6974,7 +6974,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] ; AVX512-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm19 & (zmm17 ^ zmm25)) -; 
AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm27 ^ (zmm28 & (zmm16 ^ zmm27)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm27 ^ (zmm28 & (zmm20 ^ zmm27)) ; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,1,1] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm23, %ymm25 @@ -6988,48 +6988,48 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm29 & zmm1) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm16 & zmm1) -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11 ^ (zmm14 & (zmm11 ^ mem)) -; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm16 -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm16, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm20 & zmm1) +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm11 ^ (zmm15 & (zmm11 ^ mem)) +; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm20 +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm24 & (zmm16 ^ zmm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm24 & (zmm20 ^ zmm11)) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq $184, (%rsp), %zmm14, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: # zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ mem)) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX512-NEXT: vpternlogq $184, (%rsp), %zmm15, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: # zmm9 = zmm9 ^ (zmm15 & (zmm9 ^ mem)) ; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm10 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm24 & (zmm10 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm14 & (zmm5 ^ zmm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm15 & (zmm5 ^ zmm9)) ; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm9 -; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm23 & (zmm9 ^ zmm5)) -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm6, %zmm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm14, %zmm11 ; 
AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm14 & (zmm3 ^ zmm5)) -; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm4 +; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm15 & (zmm3 ^ zmm11)) +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm5)) ; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm23 & (zmm5 ^ zmm3)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm21 ^ (zmm30 & (zmm18 ^ zmm21)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 ^ (zmm30 & (zmm7 ^ zmm20)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 ^ (zmm30 & (zmm6 ^ zmm7)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm18 & zmm3) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & zmm3) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm6 & zmm3) ; AVX512-NEXT: vmovdqa64 %zmm8, 384(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm4, 576(%r9) ; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512-NEXT: vmovdqa64 %zmm22, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm16, 320(%r9) +; AVX512-NEXT: vmovdqa64 %zmm20, 320(%r9) ; AVX512-NEXT: vmovdqa64 %zmm2, 448(%r9) ; AVX512-NEXT: vmovdqa64 %zmm26, 512(%r9) ; AVX512-NEXT: addq $264, %rsp # imm = 0x108 @@ -7038,30 +7038,29 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-FCP-LABEL: store_i16_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdx), %ymm22 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdx), %ymm23 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdx), %xmm21 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[1,2,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm24 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm27[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -7069,267 +7068,269 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm31 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm31 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm31[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = 
ymm28[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512-FCP-NEXT: vprolq $16, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm26[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm24[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[0,1,2,3],zmm2[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm30 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512-FCP-NEXT: 
vmovdqa 32(%rdx), %ymm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpandn %ymm9, %ymm15, %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm9 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm9 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10],ymm13[11],ymm9[12,13],ymm13[14],ymm9[15] -; AVX512-FCP-NEXT: vprolq $16, %ymm14, %ymm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,3,2,3,10,11,10,10] -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm14 ^ (zmm24 & (zmm7 ^ zmm14)) -; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm14 -; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm22 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm6 & (zmm22 ^ zmm7)) -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 -; AVX512-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm6, %ymm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX512-FCP-NEXT: vprolq $16, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm24 & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; 
AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [2,3,2,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (zmm2 & (zmm13 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5],xmm15[6],xmm4[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm18 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,1] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm29, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-FCP-NEXT: vprolq $16, %ymm8, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2],ymm13[3,4],ymm0[5,6,7,8],ymm13[9],ymm0[10],ymm13[11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10,11],ymm14[12],ymm0[13],ymm14[14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm1[1],ymm10[2],ymm1[3,4],ymm10[5,6,7,8],ymm1[9],ymm10[10],ymm1[11,12],ymm10[13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm14 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] +; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3,4],ymm1[5,6,7,8],ymm3[9],ymm1[10],ymm3[11,12],ymm1[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} 
ymm4 = ymm31[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2],ymm4[3,4],ymm3[5,6,7,8],ymm4[9],ymm3[10],ymm4[11,12],ymm3[13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm23 & (zmm0 ^ zmm5)) +; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm1 +; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm8 & (zmm1 ^ zmm13)) -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX512-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm9[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm24 & (zmm4 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm6 ^ (zmm2 & (zmm10 ^ zmm6)) -; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm6 -; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm29 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = 
zmm2 ^ (zmm23 & (zmm6 ^ zmm2)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm6)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm29, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm27 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpbroadcastq 88(%r8), %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm21 & (zmm6 ^ zmm14)) +; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm11 & (zmm5 ^ zmm6)) +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm13[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm14 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6],xmm14[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm23 & (zmm6 ^ zmm3)) +; 
AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm28[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm3 +; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm9 ^ (zmm21 & (zmm12 ^ zmm9)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm11 & (zmm3 ^ zmm12)) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm25[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm10 -; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[0,1,1,1] +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,0,1],zmm10[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2],xmm13[3],xmm12[4,5],xmm13[6],xmm12[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm24 & (zmm5 ^ zmm3)) -; AVX512-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,1] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = 
mem ^ (zmm8 & (zmm10 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm20 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm20 = mem ^ (zmm8 & (zmm20 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm10 & zmm7) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm20 & zmm7) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm28 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm28 = mem ^ (zmm2 & (zmm28 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm19 = mem ^ (zmm2 & (zmm19 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm28 & zmm2) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm19 & zmm2) -; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm4)) -; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm23 & (zmm7 ^ zmm8)) +; AVX512-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm25 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm25 = mem ^ (zmm10 & (zmm25 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, (%rsp), %zmm10, %zmm18 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm18 = mem ^ (zmm10 & (zmm18 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 | (zmm25 & zmm4) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm18 & zmm4) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm26 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm26 = mem ^ (zmm21 & (zmm26 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = mem ^ (zmm21 & (zmm19 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm26 & zmm4) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm19 & zmm4) +; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm6)) +; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 384(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 576(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 512(%r9) -; AVX512-FCP-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 512(%r9) +; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -7341,8 +7342,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm23 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[3,0,3,0,7,4,7,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] @@ -7355,8 +7356,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -7371,10 +7373,9 @@ define void 
@store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512DQ-NEXT: vpandn %ymm4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512DQ-NEXT: vpandnq %ymm4, %ymm21, %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm26 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -7394,14 +7395,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[2,3,2,3,6,7,6,7] ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm15 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm1 @@ -7468,6 +7469,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm31 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm12 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm31[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] @@ -7475,7 +7477,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm3[0,1,1,1] ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm5 @@ -7498,8 +7500,7 @@ define void 
@store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm3[2,3,2,3] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] @@ -7518,40 +7519,40 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[1,1,1,2,5,5,5,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13],ymm2[14],ymm13[15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm15, %ymm14 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[1,1,2,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm16[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10],ymm6[11],ymm14[12,13],ymm6[14],ymm14[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm8, %ymm14 -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3],ymm6[4],ymm14[5,6],ymm6[7],ymm14[8,9],ymm6[10],ymm14[11],ymm6[12],ymm14[13,14],ymm6[15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10,11],ymm14[12],ymm2[13],ymm14[14],ymm2[15] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm14 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm2[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm8, %ymm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,0] +; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,0] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,3,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,3,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] @@ -7569,13 +7570,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm29 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm29 = mem ^ (zmm28 & (zmm29 ^ mem)) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm23 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm23 = mem ^ (zmm30 & (zmm23 ^ mem)) +; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = mem ^ (zmm30 & (zmm17 ^ mem)) ; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm19 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm17)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm24 ^ (zmm30 & (zmm25 ^ zmm24)) ; AVX512DQ-NEXT: vpbroadcastq 
24(%r8), %ymm17 ; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm23 @@ -7583,7 +7584,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm19 & (zmm17 ^ zmm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm27 ^ (zmm28 & (zmm16 ^ zmm27)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm27 ^ (zmm28 & (zmm20 ^ zmm27)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,1,1] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm23, %ymm25 @@ -7597,48 +7598,48 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm29 & zmm1) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm16 & zmm1) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11 ^ (zmm14 & (zmm11 ^ mem)) -; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm16 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm20 & zmm1) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = zmm11 ^ (zmm15 & (zmm11 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm20 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm24 & (zmm16 ^ zmm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm24 & (zmm20 ^ zmm11)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, (%rsp), %zmm14, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ mem)) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX512DQ-NEXT: vpternlogq $184, (%rsp), %zmm15, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm9 = zmm9 ^ (zmm15 & (zmm9 ^ mem)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm10 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm24 & (zmm10 ^ zmm9)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm14 & (zmm5 ^ zmm9)) +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm15 & (zmm5 ^ zmm9)) ; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm9 -; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm23 & (zmm9 ^ zmm5)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm6, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm14, %zmm11 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm14 & (zmm3 ^ zmm5)) -; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm4 +; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm15 & (zmm3 ^ zmm11)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm5)) ; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm23 & (zmm5 ^ zmm3)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm21 ^ (zmm30 & (zmm18 ^ zmm21)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 ^ (zmm30 & (zmm7 ^ zmm20)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 ^ (zmm30 & (zmm6 ^ zmm7)) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm18 & zmm3) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & zmm3) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm6 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 576(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 512(%r9) ; AVX512DQ-NEXT: addq $264, %rsp # imm = 0x108 @@ -7647,30 +7648,29 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdx), %ymm22 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdx), %ymm23 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm23[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdx), %xmm21 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[1,2,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm27[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -7678,267 +7678,269 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm31 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: 
vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm31[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm28[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm26[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm2 +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm24[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm0[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[0,1,2,3],zmm2[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm2 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm15, %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10],ymm13[11],ymm9[12,13],ymm13[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,3,2,3,10,11,10,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm14 ^ (zmm24 & (zmm7 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm14 -; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm6 & (zmm22 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 -; AVX512DQ-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm24 & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm1 -; 
AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [2,3,2,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (zmm2 & (zmm13 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5],xmm15[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, 
%zmm4, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm8, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,3,2,3,10,11,10,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2],ymm13[3,4],ymm0[5,6,7,8],ymm13[9],ymm0[10],ymm13[11,12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10,11],ymm14[12],ymm0[13],ymm14[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm1[1],ymm10[2],ymm1[3,4],ymm10[5,6,7,8],ymm1[9],ymm10[10],ymm1[11,12],ymm10[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm3 +; 
AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3,4],ymm1[5,6,7,8],ymm3[9],ymm1[10],ymm3[11,12],ymm1[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2],ymm4[3,4],ymm3[5,6,7,8],ymm4[9],ymm3[10],ymm4[11,12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm23 & (zmm0 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm1 +; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm8 & (zmm1 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm24 & (zmm4 ^ zmm9)) -; 
AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm6 ^ (zmm2 & (zmm10 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm29 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm23 & (zmm6 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm29, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm10[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm27 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastq 88(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm21 & (zmm6 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm11 & (zmm5 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, 
%xmm6, %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm13[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6],xmm14[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm23 & (zmm6 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm28[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm3 +; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm9 ^ (zmm21 & (zmm12 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm11 & (zmm3 ^ zmm12)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm25[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[0,1,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; 
AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,0,1],zmm10[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2],xmm13[3],xmm12[4,5],xmm13[6],xmm12[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm24 & (zmm5 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = mem ^ (zmm8 & (zmm10 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm20 = mem ^ (zmm8 & (zmm20 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm10 & zmm7) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm20 & zmm7) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm28 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm28 = mem ^ (zmm2 & (zmm28 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm19 = mem ^ (zmm2 & (zmm19 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm28 & zmm2) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm19 & zmm2) -; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} zmm7 = zmm8 ^ (zmm23 & (zmm7 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm25 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm25 = mem ^ (zmm10 & (zmm25 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogq $226, (%rsp), %zmm10, %zmm18 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm18 = mem ^ (zmm10 & (zmm18 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 | (zmm25 & zmm4) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm18 & zmm4) +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm26 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm26 = mem ^ (zmm21 & (zmm26 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm19 = mem ^ (zmm21 & (zmm19 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm26 & zmm4) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm19 & zmm4) +; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 384(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 576(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 384(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 512(%r9) -; AVX512DQ-FCP-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 512(%r9) +; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7954,56 +7956,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = 
[0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8050,56 +8052,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8146,56 +8148,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 @@ -8242,56 +8244,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,0,0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,32,0,0,0,1,33,0,0,0,2,34,0,0,0,3,35,0,0,0,4,36,0,0,0,5,37,0,0,0,6,38] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,0,26,58,0,0,0,27,59,0,0,0,28,60,0,0,0,29,61,0,0,0,30,62,0,0,0,31,63,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [25,0,0,0,58,26,0,0,0,59,27,0,0,0,60,28,0,0,0,61,29,0,0,0,62,30,0,0,0,63,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,19,51,0,0,0,20,52,0,0,0,21,53,0,0,0,22,54,0,0,0,23,55,0,0,0,24,56,0,0,0,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [19,0,0,0,52,20,0,0,0,53,21,0,0,0,54,22,0,0,0,55,23,0,0,0,56,24,0,0,0,57,25,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] ; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [0,0,0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,13,45,0,0,0,14,46,0,0,0,15,47,0,0,0,16,48,0,0,0,17,49,0,0,0,18,50,0,0,0,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [6,38,0,0,0,7,39,0,0,0,8,40,0,0,0,9,41,0,0,0,10,42,0,0,0,11,43,0,0,0,12,44] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 5e26564465c25..2c213dc5f07db 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -22,8 +22,8 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r9), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa (%r9), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -50,8 +50,8 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; 
AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vmovdqa (%r9), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa (%r9), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -69,16 +69,16 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i16_stride6_vf2: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] @@ -135,16 +135,16 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i16_stride6_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] @@ -179,16 +179,16 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i16_stride6_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] +; 
AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] @@ -231,7 +231,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, 16(%rax) @@ -249,7 +249,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rax) @@ -267,7 +267,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rax) @@ -285,7 +285,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,1,3,5,7,17,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rax) @@ -364,8 +364,8 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] @@ -375,9 +375,9 @@ define void 
@store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5],xmm3[6,7] ; AVX-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX-NEXT: vmovaps %ymm1, (%rax) ; AVX-NEXT: vzeroupper @@ -408,9 +408,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] ; AVX2-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-NEXT: vzeroupper @@ -439,9 +439,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] ; AVX2-FP-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FP-NEXT: vzeroupper @@ -472,9 +472,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] ; AVX2-FCP-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -535,10 +535,10 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} 
ymm3 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] @@ -603,10 +603,10 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] @@ -630,7 +630,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) @@ -651,7 +651,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) @@ -672,7 +672,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) @@ -693,7 +693,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) @@ -854,9 +854,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] @@ -896,9 +896,9 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29] @@ -935,7 +935,6 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FCP-LABEL: store_i16_stride6_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 @@ -944,15 +943,16 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] +; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] @@ -996,8 +996,8 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] @@ -1016,10 +1016,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper @@ -1030,35 +1030,35 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,1,5,2,6,1,5] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,1,5,0,4,1,5] +; 
AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,4,4,0,0,4,4,0] +; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 @@ -1066,12 +1066,12 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1088,8 +1088,8 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] @@ -1108,10 +1108,10 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -1122,35 +1122,35 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,1,5,2,6,1,5] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,1,5,0,4,1,5] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,4,4,0,0,4,4,0] +; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm5 ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 @@ -1158,12 +1158,12 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1172,14 +1172,14 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1191,14 +1191,14 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1210,14 +1210,14 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1229,14 +1229,14 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -1586,22 +1586,26 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i16_stride6_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vmovdqa (%rdi), %ymm6 ; AVX2-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; 
AVX2-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX2-NEXT: vmovdqa (%rsi), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,1] +; AVX2-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5] -; AVX2-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1] +; AVX2-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,1,2,1] +; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] @@ -1614,32 +1618,39 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpsrldq {{.*#+}} ymm12 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] +; AVX2-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-NEXT: vmovdqa (%r9), %ymm12 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; 
AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm14, %ymm15, %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-NEXT: vmovdqa %xmm7, %xmm13 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-NEXT: vmovdqa %xmm1, %xmm8 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] @@ -1649,170 +1660,169 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX2-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-NEXT: vmovdqa %ymm5, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX2-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7] +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11] ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm14, 64(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-NEXT: popq %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i16_stride6_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $24, %rsp -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm7 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm8 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,1,1] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = 
xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm1 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm13 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = 
ymm15[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm13 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6],ymm15[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-FP-NEXT: 
vpshufd {{.*#+}} xmm14 = xmm14[1,0,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] +; AVX2-FP-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FP-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[8],ymm3[8],ymm11[9],ymm3[9],ymm11[10],ymm3[10],ymm11[11],ymm3[11] +; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm9, %ymm3 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm12, %ymm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm15, 96(%rax) @@ -1823,63 +1833,71 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FP-NEXT: addq $24, %rsp ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i16_stride6_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm8 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FCP-NEXT: subq $24, %rsp +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm9 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FCP-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = 
xmm9[2,1,3,3,4,5,6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm7, %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm14 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm12 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm14, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,0,0,2,0,0,3,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,u,u,2,u,u,3,u] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm8 +; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] @@ -1887,55 +1905,60 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,0,0,6,0,0,7,0] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm4[4],ymm13[4],ymm4[5],ymm13[5],ymm4[6],ymm13[6],ymm4[7],ymm13[7],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15] +; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,u,6,u,u,7,u] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,5,0,0,6] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm15[2,1,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] +; AVX2-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,4,u,u,5,u,u,6] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm14, 64(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FCP-NEXT: addq $24, %rsp ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1956,7 +1979,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,5,12,u,4,13,u,7] ; AVX512-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] @@ -1965,10 +1988,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [5,u,14,6,u,15,7,u] ; AVX512-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} 
ymm8 = [8,21,10,11,22,13,14,23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX512-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 @@ -1978,7 +2001,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,1,9,u,2,10] ; AVX512-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 ; AVX512-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-NEXT: vmovdqa (%r8), %xmm11 @@ -1993,7 +2016,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] ; AVX512-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] @@ -2003,17 +2026,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,u,5,13,u,6,14] ; AVX512-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,10,2,u,11,3,u] ; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -2037,12 +2060,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,8,u,1,9,u,2,10] ; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 @@ -2051,10 +2074,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] @@ -2062,17 +2085,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,u,10,2,u,11,3,u] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,12,u,5,13,u,6,14] ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = 
ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 @@ -2084,19 +2107,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,5,12,u,4,13,u,7] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,14,6,u,15,7,u] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 @@ -2123,7 +2146,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,5,12,u,4,13,u,7] ; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] @@ -2132,10 +2155,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = 
ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [5,u,14,6,u,15,7,u] ; AVX512DQ-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 @@ -2145,7 +2168,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,1,9,u,2,10] ; AVX512DQ-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 @@ -2160,7 +2183,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] ; AVX512DQ-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] @@ -2170,17 +2193,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,u,5,13,u,6,14] ; AVX512DQ-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,10,2,u,11,3,u] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: 
vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) @@ -2204,12 +2227,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,8,u,1,9,u,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 @@ -2218,10 +2241,10 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] @@ -2229,17 +2252,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [1,u,10,2,u,11,3,u] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,12,u,5,13,u,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 @@ -2251,19 +2274,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,5,12,u,4,13,u,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,14,6,u,15,7,u] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 @@ -2275,24 +2298,24 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-LABEL: store_i16_stride6_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, 
%zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2302,24 +2325,24 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-FCP-LABEL: store_i16_stride6_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2329,24 +2352,24 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-BW-LABEL: store_i16_stride6_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -2356,24 +2379,24 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -3112,7 +3135,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -3215,7 +3238,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm6, %ymm14, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload @@ -3286,7 +3309,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm12 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -3395,7 +3418,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 @@ -3444,8 +3467,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -3492,7 +3514,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -3547,7 +3569,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -3561,7 +3583,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm0, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3670,7 +3692,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm1 @@ -3722,8 +3744,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 @@ -3756,7 +3777,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,2,1,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,1,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] @@ -3768,7 +3789,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm13, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -3790,7 +3811,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -3837,7 +3858,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -3862,7 +3883,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] @@ -3955,7 +3976,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rdx), %xmm12 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512-NEXT: vpermt2d %zmm1, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512-NEXT: vmovdqa (%rdi), %xmm9 @@ -4024,7 +4045,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] @@ -4147,7 +4168,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 ; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4158,17 +4179,17 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 
%zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm6 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm30 @@ -4178,11 +4199,10 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm10 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,0,3,10,0,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,u,3,10,u,10,11] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm10, %zmm19 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 @@ -4212,26 +4232,26 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,0,0,u,8,8,u,9] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm11 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,8,8,u,9] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm10 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; 
AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 @@ -4240,14 +4260,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm15 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,8,0,9,0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [8,8,u,9,0,1,0,1] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm15 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 @@ -4266,7 +4286,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 ; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm22, %ymm24 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm21 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} @@ -4279,21 +4299,21 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, 
%ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 @@ -4301,7 +4321,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,1,0,10,10,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,u,0,1,u,10,10,u] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -4359,7 +4379,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 @@ -4456,7 +4476,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm31, %zmm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] @@ -4581,7 +4601,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4592,17 +4612,17 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm30 @@ -4612,11 +4632,10 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm31 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,0,3,10,0,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,u,3,10,u,10,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm10, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 @@ -4646,26 +4665,26 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,0,0,u,8,8,u,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,8,8,u,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 @@ -4674,14 +4693,14 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm15 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,8,0,9,0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [8,8,u,9,0,1,0,1] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm15 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 @@ -4700,7 +4719,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm22, %ymm24 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} @@ -4713,21 +4732,21 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 @@ -4735,7 +4754,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,1,0,10,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,u,0,1,u,10,10,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -4782,352 +4801,352 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-LABEL: store_i16_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbw 
{{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i16_stride6_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 +; AVX512BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, 
%zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i16_stride6_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = 
[8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w 
%zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm8, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = 
[0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -6588,7 +6607,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -6815,7 +6834,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] @@ -6961,7 +6980,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm2 = mem[2,3,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7173,7 +7192,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -7361,7 +7380,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 @@ -7442,8 +7461,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] @@ -7532,7 +7550,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7743,7 +7761,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3 @@ -7848,8 +7866,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 @@ -7955,7 +7972,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload @@ -8012,7 +8029,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -8080,7 +8097,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,1,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,1,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -8093,7 +8110,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8154,7 +8171,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] @@ -8325,7 +8342,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm5[4],ymm12[5],ymm5[5],ymm12[6],ymm5[6],ymm12[7],ymm5[7],ymm12[12],ymm5[12],ymm12[13],ymm5[13],ymm12[14],ymm5[14],ymm12[15],ymm5[15] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[8],ymm5[8],ymm12[9],ymm5[9],ymm12[10],ymm5[10],ymm12[11],ymm5[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512-NEXT: vmovdqa (%rcx), %xmm5 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 @@ -8415,7 +8432,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] @@ -8754,7 +8771,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = 
ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8765,7 +8782,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 @@ -8773,10 +8790,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0 @@ -8786,10 +8803,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,0,3,10,0,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,u,3,10,u,10,11] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm20 ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8799,7 +8815,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; 
AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,0,0,u,8,8,u,9] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm26 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8826,7 +8842,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm2 @@ -8840,7 +8856,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -8851,20 +8867,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,8,0,9,0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [8,8,u,9,0,1,0,1] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm6 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 ; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8896,7 +8912,7 @@ define void 
@store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,0,0,8,8,u,9] ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 @@ -9022,26 +9038,26 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[8],ymm10[8],ymm7[9],ymm10[9],ymm7[10],ymm10[10],ymm7[11],ymm10[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm4 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm28 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,0,10,10,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,0,1,u,10,10,u] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -9177,7 +9193,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; 
AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm4 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9374,7 +9390,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm11 @@ -9661,7 +9677,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9672,7 +9688,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -9680,10 +9696,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0 @@ -9693,10 +9709,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,0,3,10,0,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,u,3,10,u,10,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9706,7 +9721,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,0,0,u,8,8,u,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm26 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9733,7 +9748,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm2 @@ -9747,7 +9762,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -9758,20 +9773,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,8,0,9,0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [8,8,u,9,0,1,0,1] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9803,7 +9818,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,0,0,8,8,u,9] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 @@ -9929,26 +9944,26 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[8],ymm10[8],ymm7[9],ymm10[9],ymm7[10],ymm10[10],ymm7[11],ymm10[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm1 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm28 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,0,10,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,0,1,u,10,10,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -10077,139 +10092,139 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm10, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = 
[24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, 
%zmm16 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm1 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm12 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm17, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -10219,139 +10234,139 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2w 
%zmm15, %zmm11, %zmm20 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm10, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm19, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm18 +; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, 
%zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm6, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm20, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm17, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = 
[0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm15, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm17, %zmm1 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 704(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -10361,139 +10376,139 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; 
AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm10, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm19, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm18 +; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm12 +; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm11, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: 
vpermi2w %zmm11, %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQ-BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm11 -; 
AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm20, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm17, %zmm1 
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 704(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -10503,139 +10518,139 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm10, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w 
%zmm24, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm19, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm26, 
%zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = 
[0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm6, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm17, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm20, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm17, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm15, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm17, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 704(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 6f50d61f4d1f4..6bfb5001694a8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -19,13 +19,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride7_vf2: ; SSE: # %bb.0: ; 
SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] @@ -60,9 +60,9 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] @@ -71,10 +71,10 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6],xmm3[7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,14,15,2,3,6,7,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] ; AVX-NEXT: vpextrd $2, %xmm1, 24(%rax) +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] ; AVX-NEXT: vmovq %xmm0, 16(%rax) ; AVX-NEXT: vmovdqa %xmm2, (%rax) ; AVX-NEXT: retq @@ -89,15 +89,15 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-NEXT: vmovdqa %xmm0, (%rax) @@ -114,15 +114,15 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, (%r10), 
%ymm1, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FP-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FP-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) @@ -139,15 +139,15 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -157,22 +157,22 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: store_i16_stride7_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, 16(%rax) ; AVX512-NEXT: vmovdqa %xmm0, (%rax) @@ -182,22 +182,22 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-LABEL: store_i16_stride7_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -207,22 +207,22 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-LABEL: store_i16_stride7_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-NEXT: vinserti128 $1, (%rax), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; 
AVX512DQ-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm1, 16(%rax) ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) @@ -232,22 +232,22 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rax), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm2, 24(%rax) ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -264,9 +264,9 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -285,9 +285,9 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -302,13 
+302,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -323,13 +323,13 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -411,14 +411,14 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; SSE-NEXT: pandn %xmm9, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,3,1,3,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] @@ -430,7 +430,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: orps %xmm0, %xmm3 ; SSE-NEXT: movaps %xmm3, (%rax) -; SSE-NEXT: movq %xmm7, 48(%rax) +; SSE-NEXT: movq %xmm8, 48(%rax) ; SSE-NEXT: movdqa %xmm6, 32(%rax) ; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: retq @@ -488,7 +488,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX2-LABEL: store_i16_stride7_vf4: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -510,18 +509,19 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,1] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, 48(%rax) @@ -532,7 +532,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-LABEL: store_i16_stride7_vf4: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -554,17 +553,18 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vmovq %xmm1, 48(%rax) @@ -575,7 +575,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-LABEL: store_i16_stride7_vf4: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -587,24 +586,25 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,5,3,7,1,5,3,7] ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [5,7,1,3,7,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [5,7,1,3,7,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm5[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3],zero,zero,ymm4[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,2,6,0,4,2,6] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 @@ -629,19 +629,19 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,3,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm1[u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u],zero,zero,zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[18,19,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,4,5,12,13],zero,zero,ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm4[26,27],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,20,21,28,29] +; 
AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -657,7 +657,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-LABEL: store_i16_stride7_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -665,8 +664,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] @@ -676,14 +673,17 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm4[26,27],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,20,21,28,29] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm1) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,11,1,3,11,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,11,1,3,11,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 ; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 @@ -711,19 +711,19 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,3,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm1[u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u],zero,zero,zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[18,19,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,4,5,12,13],zero,zero,ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm4[26,27],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,20,21,28,29] +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -739,7 +739,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -747,8 +746,6 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] @@ -758,14 +755,17 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,3,7,1,5,3,7] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,8,9],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,2,3],zero,zero,ymm4[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm4[26,27],zero,zero,zero,zero,ymm4[u,u,u,u,u,u,20,21,28,29] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm1) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,11,1,3,11,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [9,11,1,3,11,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 @@ -793,7 +793,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -817,9 +817,9 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512BW-FCP-NEXT: vmovdqa 
{{.*#+}} ymm1 = [0,2,4,u] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -844,7 +844,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -868,9 +868,9 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -1142,7 +1142,6 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-LABEL: store_i16_stride7_vf8: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-NEXT: vmovdqa (%rdx), %xmm4 @@ -1151,46 +1150,47 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm12 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1198,20 +1198,19 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7] ; AVX2-NEXT: vpsrld $16, %xmm3, %xmm2 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX2-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX2-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX2-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-NEXT: vmovdqa %ymm9, 64(%rcx) +; AVX2-NEXT: vmovdqa %ymm6, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i16_stride7_vf8: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 @@ -1228,38 +1227,39 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3,4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11,12,13],ymm6[14],ymm10[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FP-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FP-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1267,20 +1267,19 @@ define void 
@store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7] ; AVX2-FP-NEXT: vpsrld $16, %xmm3, %xmm2 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FP-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX2-FP-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm9, 64(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rcx) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i16_stride7_vf8: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4 @@ -1288,44 +1287,45 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm8 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25] -; AVX2-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FCP-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [1,5,u,u,5,2,6,u] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,24,25] +; AVX2-FCP-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm12[1,3,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,28,29],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm8, %ymm11, %ymm8 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[1,3,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] -; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, 
%ymm10, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FCP-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1333,83 +1333,82 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7] ; AVX2-FCP-NEXT: vpsrld $16, %xmm3, %xmm2 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX2-FCP-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rcx) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i16_stride7_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-NEXT: vmovdqa (%r9), %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512-NEXT: vmovdqa (%r8), %xmm8 +; AVX512-NEXT: 
vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11,12,13],ymm10[14],ymm9[15] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11,12,13],ymm10[14],ymm7[15] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512-NEXT: vporq %zmm5, %zmm7, %zmm5 +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpbroadcastd (%rax), %ymm12 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm5)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm6[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} 
ymm6 = ymm6[0,1,8,9],zero,zero,ymm6[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm6[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogd {{.*#+}} ymm5 = (mem & ~ymm5) | ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (mem & (ymm5 ^ ymm2)) ; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX512-NEXT: vpsrld $16, %xmm6, %xmm1 -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512-NEXT: vpsrld $16, %xmm9, %xmm1 +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX512-NEXT: vmovdqa %ymm5, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -1425,7 +1424,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,5,u,u,5,2,6,u] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 @@ -1434,10 +1433,10 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm12 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = 
zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] @@ -1447,6 +1446,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) @@ -1457,82 +1457,81 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} 
ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11,12,13],ymm10[14],ymm9[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11,12,13],ymm10[14],ymm7[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512DQ-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512DQ-NEXT: vporq %zmm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm5)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm6[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,ymm6[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm6[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
ymm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm5 = (mem & ~ymm5) | ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (mem & (ymm5 ^ ymm2)) ; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vpsrld $16, %xmm6, %xmm1 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512DQ-NEXT: vpsrld $16, %xmm9, %xmm1 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512DQ-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm5, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -1548,7 +1547,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,5,u,u,5,2,6,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 @@ -1557,10 +1556,10 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm12 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[1,3,1,3] @@ -1570,6 +1569,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) @@ -1580,12 +1580,12 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%rax), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1598,15 +1598,15 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1620,15 +1620,15 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 
$1, (%r9), %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1642,15 +1642,15 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1664,15 +1664,15 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0] -; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -2277,42 +2277,40 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2: # %bb.0: ; AVX2-NEXT: subq $40, %rsp ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%rdi), %ymm15 ; AVX2-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] -; AVX2-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] +; AVX2-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-NEXT: vpermd %ymm15, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] ; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] -; AVX2-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] +; AVX2-NEXT: vpermd %ymm1, %ymm8, 
%ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] -; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-NEXT: vpermd %ymm3, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rcx), %xmm10 @@ -2320,13 +2318,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; AVX2-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vmovdqa (%r9), %xmm9 ; AVX2-NEXT: vmovdqa (%r8), %xmm14 @@ -2334,106 +2332,101 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,3,3,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] -; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,7,6] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,3,3] +; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm14, %ymm7, %ymm9, %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm7[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6,7,8],ymm7[9],ymm14[10,11],ymm7[12],ymm14[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] -; AVX2-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7,8,9],ymm14[10],ymm7[11,12],ymm14[13],ymm7[14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,3,3,3,6,7,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm14 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,2,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-NEXT: vpbroadcastd (%rax), %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-NEXT: vmovdqa %ymm13, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm10 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-NEXT: vmovdqa %ymm12, %ymm6 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3] -; AVX2-NEXT: vmovdqa %ymm3, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6,7,8],ymm7[9],ymm1[10,11],ymm7[12],ymm1[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 -; AVX2-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] +; AVX2-NEXT: vpshuflw 
{{.*#+}} ymm7 = ymm4[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-NEXT: vmovdqa %ymm4, %ymm12 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] @@ -2443,24 +2436,24 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] -; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; 
AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm7, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-NEXT: vmovdqa %ymm14, 192(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2473,52 +2466,52 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FP-LABEL: store_i16_stride7_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $40, %rsp +; AVX2-FP-NEXT: pushq %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%r9), %ymm13 -; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FP-NEXT: vpermd %ymm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] -; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-FP-NEXT: 
vpshufb {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpermd %ymm3, %ymm9, %ymm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm11 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm9 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm14 @@ -2526,125 +2519,120 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, 
%ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm15[2],xmm1[3,4],xmm15[5],xmm1[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: 
vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 -; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm8[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8,9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FP-NEXT: vmovdqa %ymm14, 192(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FP-NEXT: addq $40, %rsp +; AVX2-FP-NEXT: popq %rax ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -2654,81 +2642,81 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm15 -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6,7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u] ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13 @@ -2736,70 +2724,71 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm8 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3,4],xmm9[5],xmm1[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = 
[0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2811,7 +2800,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm11, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 64(%rax) ; AVX2-FCP-NEXT: popq %rax ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -2819,524 +2808,524 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i16_stride7_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-NEXT: vmovdqa (%r9), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512-NEXT: vmovdqa (%r8), %ymm1 +; AVX512-NEXT: vmovdqa (%r9), %ymm2 ; AVX512-NEXT: vmovdqa (%rax), %ymm13 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm16 -; AVX512-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vporq %ymm5, %ymm6, %ymm16 +; AVX512-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} 
ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm8[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[12,13,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-NEXT: vporq %ymm5, %ymm6, %ymm18 +; AVX512-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512-NEXT: vmovdqa (%rsi), %xmm9 ; AVX512-NEXT: vporq %ymm5, %ymm10, %ymm19 ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = 
xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm20 +; AVX512-NEXT: vprold $16, %xmm9, %xmm11 +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] -; AVX512-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,0,u,u,1,1,u,u,u,u,16,17,u,u,18,u] +; AVX512-NEXT: vpermi2d %zmm11, %zmm12, %zmm17 +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] -; AVX512-NEXT: vmovdqa (%r9), %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512-NEXT: vmovdqa (%r8), %xmm15 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-NEXT: vmovdqa (%r9), %xmm15 +; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,1,u,1,1,u,u,18,19,u,19,19,u,u] +; AVX512-NEXT: vpermi2d %zmm12, %zmm0, %zmm15 +; AVX512-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7,8,9],ymm0[10],ymm12[11,12],ymm0[13],ymm12[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7)) -; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm8 ^ (mem & (zmm10 ^ zmm8)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm9, %zmm4 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (mem & (zmm7 ^ zmm4)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7)) -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18)) -; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8)) -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6)) -; AVX512-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10)) +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm2 +; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm20[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: 
vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512-NEXT: vpermd %zmm13, %zmm2, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm6[0,0,1,1,4,4,5,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17)) +; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm15)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm1)) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm7)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,1,3,2] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) +; AVX512-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf16: ; AVX512-FCP: # %bb.0: +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512-FCP-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vporq %ymm4, %ymm7, %ymm16 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm4, %ymm7, %ymm17 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm4, %ymm7, %ymm18 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13,14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,u,3,2,u,10,10,11] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm19 +; AVX512-FCP-NEXT: vprold $16, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8,9,10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,0,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 -; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,10,u,11,10] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,u,1,u,u,8,u,9] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm15 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-FCP-NEXT: vpshufd 
{{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6],xmm13[7] +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm10 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm20 +; AVX512-FCP-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm20 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,0,0,1,8,9,9,u] +; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm13 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[16,17,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm20, %zmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm11[0,0,1,1] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4)) +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [6,u,u,u,7,u,u,7] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm19[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (mem & (zmm1 ^ zmm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm8[0,0,1,1,4,4,5,5] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm15 ^ (mem & (zmm1 ^ zmm15)) +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15)) -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm20)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) -; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (mem & (ymm0 ^ ymm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm16 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vporq %ymm5, %ymm6, %ymm16 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm8[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[12,13,14,15],zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm7[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-NEXT: vporq %ymm5, %ymm6, %ymm18 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 ; AVX512DQ-NEXT: vporq %ymm5, %ymm10, %ymm19 ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm20 +; AVX512DQ-NEXT: vprold $16, %xmm9, %xmm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,0,u,u,1,1,u,u,u,u,16,17,u,u,18,u] +; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm12, %zmm17 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-NEXT: 
vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm15 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512DQ-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 
= ymm4[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,1,u,1,1,u,u,18,19,u,19,19,u,u] +; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm0, %zmm15 +; AVX512DQ-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7,8,9],ymm0[10],ymm12[11,12],ymm0[13],ymm12[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm8 ^ (mem & (zmm10 ^ zmm8)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm9, %zmm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (mem & (zmm7 ^ zmm4)) -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18)) -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6)) -; AVX512DQ-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm20[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512DQ-NEXT: vpermd %zmm13, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm6[0,0,1,1,4,4,5,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17)) +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm15)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm1)) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm7)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,1,3,2] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), 
%ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} ymm4 = zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm7, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u],zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm7, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm7, %ymm18 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13,14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,u,3,2,u,10,10,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm19 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8,9,10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,0,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vprold $16, 
%xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,10,u,11,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,u,1,u,u,8,u,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm15 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6],xmm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm20 +; AVX512DQ-FCP-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,0,0,1,8,9,9,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm13 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[16,17,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm20, %zmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm11[0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [6,u,u,u,7,u,u,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm19[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (mem & (zmm1 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm8[0,0,1,1,4,4,5,5] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm15 ^ (mem & (zmm1 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15)) -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = 
zmm2 ^ (mem & (zmm2 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm4 ^ (mem & (ymm0 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -3351,27 +3340,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} 
zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -3397,27 +3386,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3443,27 +3432,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -3489,27 +3478,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = 
[0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4778,13 +4767,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9 ; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] ; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,3,u,u,u,4,u,u] ; AVX2-NEXT: vpermd %ymm11, %ymm2, %ymm4 ; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] @@ -4792,22 +4781,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,3,u,u,u,4,u] ; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa (%r8), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4819,15 +4808,15 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa 32(%rax), %ymm13 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,3,0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,3,u,u,u,4] ; AVX2-NEXT: vpermd %ymm13, %ymm4, %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 @@ -4845,22 +4834,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -4873,18 +4862,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -4900,7 +4889,7 @@ define void @store_i16_stride7_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4924,7 +4913,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: vpbroadcastd 32(%rax), %ymm9 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 ; AVX2-NEXT: vmovdqa (%r9), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4936,11 +4925,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-NEXT: vpbroadcastd (%rax), %ymm14 ; AVX2-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] @@ -4951,7 +4940,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] @@ -4966,16 +4955,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4986,7 +4975,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] @@ -5001,9 +4990,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -5012,7 +5001,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -5029,7 +5018,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-NEXT: 
vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] @@ -5041,9 +5030,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] @@ -5054,7 +5043,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] @@ -5063,9 +5052,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] @@ -5081,7 +5070,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -5104,7 +5093,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -5114,7 +5103,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5145,119 +5134,120 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-LABEL: store_i16_stride7_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,3,0,0,0,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,3,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpermd %ymm11, %ymm9, %ymm5 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd %ymm13, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm11 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4] -; AVX2-FP-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm5, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm9, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm4, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm11, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqa (%r9), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm8, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm3, 
%ymm1, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm13 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,3,0,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpermd %ymm13, %ymm4, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm15, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm6 @@ -5273,7 +5263,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm14 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm13 @@ -5295,7 +5285,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5305,16 +5295,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; 
AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm9 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -5322,7 +5312,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] @@ -5337,13 +5327,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5354,7 +5344,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 @@ -5368,9 +5358,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] @@ -5378,7 +5368,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -5393,7 +5383,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] @@ -5404,9 +5394,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] @@ -5416,7 +5406,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} 
ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] @@ -5424,12 +5414,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload @@ -5442,7 +5431,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] @@ -5463,7 +5452,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] @@ -5472,7 +5461,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; 
AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5487,9 +5476,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm0, 288(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm15, 256(%rax) @@ -5503,16 +5492,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FCP-LABEL: store_i16_stride7_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $312, %rsp # imm = 0x138 +; AVX2-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm8 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm10 -; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] @@ -5520,68 +5509,66 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 
= [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] 
; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] @@ -5590,118 +5577,119 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm10 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 +; AVX2-FCP-NEXT: 
vpshufd {{.*#+}} ymm13 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm8 +; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm13 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,3,0,0,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,u,u,3,u,u,u,4] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm15[2,2,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 @@ -5719,7 +5707,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 @@ -5731,101 +5719,101 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm11 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX2-FCP-NEXT: vpshufb 
%xmm4, %xmm6, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm10, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm10 +; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm11 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm10 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2],xmm12[3,4],xmm10[5],xmm12[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] -; AVX2-FCP-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3],xmm0[4],xmm10[5,6],xmm0[7] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3,4],xmm4[5],xmm10[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm3 = ymm3[0,0,1,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5843,7 +5831,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax) @@ -5853,1253 +5841,1273 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FCP-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i16_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $664, %rsp # imm = 0x298 -; AVX512-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512-NEXT: subq $632, %rsp # imm = 0x278 +; AVX512-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm5, %ymm10, %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-NEXT: vmovdqa 
{{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm26 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm3 -; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm4 -; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512-NEXT: vmovdqa (%r9), %ymm12 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm4, %ymm12, %ymm0 +; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,18,19,u,19,19,u,u,0,1,u,1,2,u,u,3] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512-NEXT: vpshufb %ymm13, %ymm9, %ymm1 +; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm3 +; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm1 +; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm3 +; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512-NEXT: vpshufb %ymm4, %ymm14, %ymm1 +; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = 
[0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,16,u,u,17,17,u,u,0,u,u,1,2,u,u,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vmovdqa (%r8), %xmm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,0,1,u,1,1,u,u,18,19,u,19,19,u,u] +; AVX512-NEXT: vpermi2d %zmm7, %zmm4, %zmm25 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa (%rax), %ymm6 -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512-NEXT: vmovdqa (%rax), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; 
AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8,9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-NEXT: vprold $16, %ymm14, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm4[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-NEXT: vmovdqa %ymm10, %ymm6 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512-NEXT: vprold $16, %xmm5, %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8,9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm5 -; AVX512-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512-NEXT: vprold $16, 
%xmm4, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[2,1,3,2] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = 
ymm17[2,2,2,3] -; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; AVX512-NEXT: vprold $16, %ymm14, %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] -; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512-NEXT: # ymm18 = mem[2,2,2,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; 
AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7,8,9],ymm4[10],ymm8[11,12],ymm4[13],ymm8[14,15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15] +; AVX512-NEXT: vmovdqa %ymm7, %ymm2 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] +; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm31[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm20[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,1,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm16[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] +; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: # xmm1 = mem[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 32-byte Folded Reload +; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm16 & (zmm3 ^ zmm1)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm17 & (zmm3 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm30, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512-NEXT: vmovdqa64 (%rax), %zmm30 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512-NEXT: vpermd %zmm30, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm23[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm31, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,1,3,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512-NEXT: # xmm12 = mem[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,1,3,2] ; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload ; AVX512-NEXT: # ymm26 = mem[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] +; AVX512-NEXT: vpermq $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512-NEXT: # ymm27 = mem[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm22[0,0,2,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm17 & (zmm6 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm2 +; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm16 & (zmm8 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm8)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm2, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm6, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm2)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) -; 
AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) -; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm6)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm6 ^ (mem & (zmm2 ^ zmm6)) +; AVX512-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512-NEXT: vprold $16, %ymm9, %ymm8 +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm25)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm29, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[2,1,3,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512-NEXT: vpermd %zmm30, %zmm2, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512-NEXT: addq $664, %rsp # imm = 0x298 +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-NEXT: addq $632, %rsp # imm = 0x278 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $296, %rsp # imm = 0x128 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512-FCP-NEXT: subq $328, %rsp # imm = 0x148 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512-FCP-NEXT: 
vmovdqa64 %ymm5, %ymm25 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,2,2,3,10,u,11,u] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; 
AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,2,2,3,8,u,9,u] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm21 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = 
[8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,u,8,8,9] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,0,u,1,8,8,9,u] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm27 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [8,9,9,u,0,0,1,1] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 -; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [8,u,9,u,0,0,1,1] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm26 +; AVX512-FCP-NEXT: vprold $16, %ymm14, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,10,u,11,10] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm14 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 -; 
AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm8 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] +; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm11 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm23 ^ (zmm29 & (zmm3 ^ zmm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm3)) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm11)) +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,u,3,10,10,11,11] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512-FCP-NEXT: vpshufb 
%xmm0, %xmm7, %xmm0 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,8,8,u,9] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm28)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm27 ^ (zmm28 & (zmm6 ^ zmm27)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6)) +; AVX512-FCP-NEXT: vprold $16, %xmm1, %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm6[2],xmm12[3,4],xmm6[5],xmm12[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm11 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm25[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (mem & (zmm26 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7,8,9],ymm11[10],ymm15[11,12],ymm11[13],ymm15[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7,8,9],ymm1[10],ymm13[11,12],ymm1[13],ymm13[14,15] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} ymm13 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7,8],ymm13[9],ymm15[10,11],ymm13[12],ymm15[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm18[0,0,1,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm12 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512-FCP-NEXT: vpermd %ymm13, %ymm27, %ymm13 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm11)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [6,u,u,u,7,u,u,7] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vpermd %zmm23, %zmm27, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm14)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm12)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm12 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm28 & (zmm8 ^ zmm12)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm9 & mem) | zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm4 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] -; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] -; AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ zmm25)) -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm21 ^ (zmm29 & (zmm20 ^ zmm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,0,0,1,8,9,9,u] +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm26)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $296, %rsp # imm = 0x128 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $664, %rsp # imm = 0x298 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512DQ-NEXT: subq $632, %rsp # imm = 0x278 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm10, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; 
AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm4 -; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm12 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm12, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,18,19,u,19,19,u,u,0,1,u,1,2,u,u,3] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm9, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm3 +; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm3 +; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm14, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: 
vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,16,u,u,17,17,u,u,0,u,u,1,2,u,u,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,0,1,u,1,1,u,u,18,19,u,19,19,u,u] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm4, %zmm25 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm6 -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; 
AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8,9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm6 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8,9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; AVX512DQ-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb 
%xmm3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; 
AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = mem[2,1,3,2] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,2,3] -; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm10 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] -; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm18 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512DQ-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7,8,9],ymm4[10],ymm8[11,12],ymm4[13],ymm8[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15] +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm31[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm20[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,1,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = 
ymm16[2,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] +; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm1 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm16 & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm17 & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm30, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm30 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512DQ-NEXT: vpermd %zmm30, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm23[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm31, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,1,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512DQ-NEXT: vpermq $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm12 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,1,3,2] ; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm26 = mem[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] +; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm27 = mem[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm22[0,0,2,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm17 & (zmm6 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm16 & (zmm8 ^ zmm2)) 
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm8)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm2, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm2)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm6 ^ (mem & (zmm2 ^ zmm6)) +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm25)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm29, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[2,1,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512DQ-NEXT: vpermd %zmm30, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-NEXT: addq $664, %rsp # imm = 0x298 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-NEXT: addq $632, %rsp # imm = 0x278 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $296, %rsp # imm = 0x128 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-FCP-NEXT: subq $328, %rsp # imm = 0x148 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: 
vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = 
[0,2,2,3,10,0,11,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,2,2,3,10,u,11,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,2,2,3,8,u,9,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,u,8,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,0,u,1,8,8,9,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm27 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [8,9,9,u,0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, 
%xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [8,u,9,u,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm26 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm14, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,10,u,11,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm9, %zmm14 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: 
vmovdqa (%rax), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm23 ^ (zmm29 & (zmm3 ^ zmm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm3)) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,u,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,8,8,u,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm28)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm27 ^ (zmm28 & (zmm6 ^ zmm27)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vprold $16, %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm6[2],xmm12[3,4],xmm6[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm25[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (mem & (zmm26 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7,8,9],ymm11[10],ymm15[11,12],ymm11[13],ymm15[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7,8,9],ymm1[10],ymm13[11,12],ymm1[13],ymm13[14,15] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7,8],ymm13[9],ymm15[10,11],ymm13[12],ymm15[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm18[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm27, %ymm13 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [6,u,u,u,7,u,u,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm27, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem 
& (zmm11 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm12 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm28 & (zmm8 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm9 & mem) | zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm4 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 
= ymm24[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ zmm25)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm21 ^ (zmm29 & (zmm20 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,0,0,1,8,9,9,u] +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 +; 
AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm26)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $296, %rsp # imm = 0x128 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7109,31 +7117,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 @@ -7143,16 +7151,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -7160,7 +7168,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k3 @@ -7170,21 +7178,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-NEXT: movl $405823681, %ecx # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -7192,7 +7200,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -7202,40 +7210,40 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm11, 
%zmm14 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -7245,31 +7253,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 @@ -7279,16 +7287,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -7296,7 +7304,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7306,21 +7314,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: movl $405823681, %ecx # imm = 0x183060C1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; 
AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -7328,7 +7336,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7338,40 +7346,40 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm14 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -7381,31 +7389,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] 
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 @@ -7415,16 +7423,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -7432,7 +7440,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 @@ -7442,21 +7450,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: movl $405823681, %ecx # imm = 0x183060C1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -7464,7 +7472,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -7474,40 +7482,40 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm13 +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -7517,31 +7525,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # 
imm = 0x60C1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 @@ -7551,16 +7559,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -7568,7 +7576,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 @@ -7578,21 +7586,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $405823681, %ecx # imm = 0x183060C1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 @@ -7600,7 +7608,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -7610,40 +7618,40 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, 
%zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64 @@ -10214,15 +10222,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rax), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 64(%rax), %ymm6 -; 
AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,3,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,3,u,u,u,4,u] ; AVX2-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [3,0,0,3,0,0,0,4] +; AVX2-NEXT: vmovdqa 64(%rax), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [3,u,u,3,u,u,u,4] ; AVX2-NEXT: vpermd %ymm3, %ymm11, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] @@ -10232,7 +10240,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpermd %ymm9, %ymm10, %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-NEXT: vpermd %ymm7, %ymm10, %ymm0 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 @@ -10244,9 +10252,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [3,u,u,u,4,u,u,4] ; AVX2-NEXT: vpermd %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10254,13 +10262,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,0,4,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,3,u,u,u,4,u,u] ; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm2 @@ -10316,23 +10324,23 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%r9), %ymm8 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rax), %ymm9 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -10347,22 +10355,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, 
%ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -10375,18 +10383,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rsi), %xmm1 @@ -10405,7 +10413,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10467,7 +10475,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10500,7 +10508,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-NEXT: vpbroadcastd 96(%rax), %ymm14 ; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload @@ -10509,7 +10517,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload @@ -10523,7 +10531,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -10568,9 +10576,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-NEXT: # xmm4 = mem[0,1,2,3,4,5,7,6] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = 
[u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] @@ -10588,7 +10596,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-NEXT: vpbroadcastd 100(%rax), %ymm8 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload @@ -10607,7 +10615,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10644,9 +10652,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -10669,7 +10677,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 @@ -10678,34 +10686,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,0,4,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,3,u,u,u,4,u,u] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -10722,7 +10730,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; 
AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -10757,27 +10765,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] ; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10787,19 +10795,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = 
ymm14[0,1,1,3,4,5,5,7] +; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm5 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload @@ -10813,7 +10820,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10841,26 +10848,25 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[1,2,2,3,5,6,6,7] -; AVX2-NEXT: vmovdqa %ymm15, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-NEXT: 
vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vmovdqa %ymm4, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] @@ -10869,11 +10875,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vmovdqa %ymm14, %ymm3 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 @@ -10892,7 +10898,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] @@ -10922,7 +10928,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm10 -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vpshufhw $249, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-NEXT: # ymm12 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] @@ -10931,28 +10938,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX2-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[2,3,3,3,6,7,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] ; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8 ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 @@ -11019,39 +11024,39 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,0,0,0,4,0,0,4] +; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [3,u,u,u,4,u,u,4] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm1 ; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,0,0,0,4,0,0] -; AVX2-FP-NEXT: vpermd %ymm3, %ymm10, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,3,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpermd %ymm4, %ymm10, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4] -; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] +; AVX2-FP-NEXT: vpermd %ymm6, %ymm0, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vmovdqa 96(%r9), %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4] -; AVX2-FP-NEXT: vpermd %ymm6, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] +; AVX2-FP-NEXT: vpermd %ymm5, %ymm0, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 @@ -11081,32 +11086,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm14 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm14, %ymm8 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm2, %ymm10, %ymm14 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm14 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm14, %ymm15, %ymm14 ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm3 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm14, %ymm7 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm14, %ymm7 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm10, %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,3,0,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,3,u,u,u,4,u] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm12, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm2, %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11125,9 +11130,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11137,7 +11142,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1 @@ -11149,29 +11154,29 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] @@ -11179,41 +11184,41 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 @@ -11232,7 +11237,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm3 @@ -11289,9 +11294,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11308,8 +11313,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd 64(%rax), %ymm13 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm13 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11320,7 +11325,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FP-NEXT: vpbroadcastd 96(%rax), %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11329,12 +11334,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX2-FP-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm4 = mem[1,1,2,2] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FP-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload @@ -11342,7 +11347,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] @@ -11384,7 +11389,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm9, %xmm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm9 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] @@ -11398,7 +11403,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FP-NEXT: vpbroadcastd 100(%rax), %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 @@ -11418,7 +11423,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] @@ -11453,9 +11458,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -11478,7 +11483,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-FP-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 @@ -11488,8 +11493,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] @@ -11501,7 +11505,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11541,7 +11545,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 @@ -11563,7 +11567,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload @@ -11571,13 +11575,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 
= ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [151522058,0,421010202,421010202] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -11585,7 +11588,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -11619,7 +11622,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm12 @@ -11638,7 +11641,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 @@ -11650,14 +11653,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218894094,0,488382238,488382238] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm11 @@ -11684,8 +11687,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] @@ -11694,7 +11696,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm15 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11712,7 +11714,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm0, %ymm0 @@ -11781,37 +11783,37 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,0,0,0,4,0,0,4] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,u,u,u,4,u,u,4] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,3,u,u,u,4,u] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11 @@ 
-11822,9 +11824,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11854,8 +11856,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] @@ -11868,9 +11869,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 @@ -11920,11 +11921,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -11945,13 +11946,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm14 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [151522058,0,421010202,421010202] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm0 @@ -11960,9 +11960,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -11977,7 +11977,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11997,15 +11997,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -12027,7 +12026,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [218894094,0,488382238,488382238] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -12035,9 +12034,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload @@ -12069,11 +12068,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,0,4,0,0,4] +; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [3,u,u,u,4,u,u,4] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2 @@ -12081,12 +12080,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,3,0,0,0,4,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,3,u,u,u,4,u,u] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm5 @@ -12094,16 +12093,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,3,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,3,u,u,u,4,u] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -12113,15 +12112,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,0,0,3,0,0,0,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} 
ymm8 = [3,u,u,3,u,u,u,4] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0 @@ -12140,22 +12139,22 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] @@ -12166,19 +12165,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] @@ -12189,18 +12188,18 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 @@ -12219,7 +12218,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12228,11 +12227,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -12276,16 +12275,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 @@ -12295,8 +12294,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %xmm11 ; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12307,7 +12306,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FCP-NEXT: vpbroadcastd 96(%rax), %ymm14 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -12316,27 +12315,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0 ; AVX2-FCP-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = 
[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX2-FCP-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[1,1,2,2] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] @@ -12373,25 +12372,25 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm11 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FCP-NEXT: vpbroadcastd 68(%rax), %ymm12 -; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastd 68(%rax), %ymm13 +; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vpbroadcastd 100(%rax), %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12 +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm13 ; 
AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] @@ -12403,9 +12402,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -12438,9 +12437,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -12463,7 +12462,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX2-FCP-NEXT: vpbroadcastd 104(%rax), %ymm10 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3 @@ -12500,7 +12499,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 512(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm13, 480(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm2, 288(%rax) @@ -12607,8 +12606,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm10 ; AVX512-NEXT: vpor %ymm10, %ymm9, %ymm9 
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm9 ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm27 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] @@ -12879,7 +12877,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] @@ -12893,7 +12891,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 ; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm2 @@ -12907,7 +12905,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3] ; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 @@ -12925,7 +12923,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] ; AVX512-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 ; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm4 ; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm6 @@ -12954,10 +12952,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4 ; 
AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 -; AVX512-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm19 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -13089,7 +13087,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm3 = (zmm3 & zmm1) | mem ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] ; AVX512-NEXT: vpermd %zmm2, %zmm21, %zmm20 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) @@ -13113,7 +13111,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] ; AVX512-NEXT: vpermd %zmm2, %zmm28, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) @@ -13198,12 +13196,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -13212,7 +13210,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 704(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 640(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 576(%rax) ; AVX512-NEXT: vmovdqa64 %zmm30, 512(%rax) @@ -13225,7 +13223,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-FCP-LABEL: store_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX512-FCP-NEXT: subq $1512, %rsp # imm = 0x5E8 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 @@ -13234,85 +13232,84 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm4 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm22 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm10, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; 
AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm8 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; 
AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 -; AVX512-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm9 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm11 +; AVX512-FCP-NEXT: vporq %ymm11, %ymm9, %ymm20 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] @@ -13325,26 +13322,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm29 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,4,u,u,u,5,u,u] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512-FCP-NEXT: vprold $16, %ymm6, %ymm11 +; AVX512-FCP-NEXT: vprold $16, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 ; AVX512-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 @@ -13353,33 +13350,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,u,u,u,6,u,u,6] ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] @@ -13397,52 +13393,52 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,u,3,10,10,11,11] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 
= ymm14[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vprold $16, %ymm8, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm20 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 @@ -13450,82 +13446,82 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm1 & (zmm4 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 -; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512-FCP-NEXT: vprold $16, %ymm22, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm24 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; 
AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm4, %xmm5 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [8,9,9,u,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vprold $16, %xmm6, %xmm4 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,u,9] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm4 & (zmm5 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm0 = [0,0,0,1,8,9,9,u] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 @@ -13536,8 +13532,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 @@ -13546,232 +13542,233 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm4 & (zmm8 ^ zmm5)) ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm11 & (zmm29 ^ zmm4)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm8)) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm21[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm12 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm11 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,2,2,3,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,2,2,3,8,9,9,u] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8,9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (zmm1 & (zmm2 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 
32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm13 ^ (zmm23 & (zmm8 ^ zmm13)) -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm23 & (zmm8 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm2)) +; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, 
%xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [6,7,3,3,7,7,6,7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm23 & (zmm15 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm6)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm26 & (zmm19 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & 
(zmm19 ^ zmm8)) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512-FCP-NEXT: vprold $16, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,u,1,8,8,9,u] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm20 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm28 & (zmm12 ^ zmm0)) +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm27 & (zmm11 ^ zmm0)) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm2)) -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = 
[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm23 & (ymm20 ^ ymm2)) +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [8,9,9,u,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm14 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) -; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm14)) +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm14 +; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm17 & 
(zmm14 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm28 & (zmm7 ^ zmm15)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm12)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm11)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm28 & (zmm1 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm10 +; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm27 & (zmm10 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1)) +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm6 +; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm17 & 
(zmm5 ^ zmm6)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm10)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem)) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm16 = mem ^ (ymm0 & (ymm16 ^ mem)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm16)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm22)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm20)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (zmm2 & zmm6) | mem +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm1 = (zmm1 & zmm6) | mem -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload 
@@ -13779,8 +13776,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -13792,12 +13789,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandn %ymm13, %ymm15, %ymm13 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] @@ -13806,46 +13805,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm28 & (zmm6 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm27 & (zmm6 ^ zmm4)) ; AVX512-FCP-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm28 & (zmm8 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm1 ^ (zmm4 & (zmm6 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm4 & (zmm8 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm1 | (zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm27 & (zmm8 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm4 ^ (mem & (zmm9 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm4 | (zmm0 & mem) ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm12 = zmm12 | (zmm0 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm0 & (zmm6 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm0 & (zmm8 ^ zmm1)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512-FCP-NEXT: addq $1512, %rsp # imm 
= 0x5E8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -13934,8 +13931,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10 ; AVX512DQ-NEXT: vpor %ymm10, %ymm9, %ymm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm9 ; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm27 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] @@ -14206,7 +14202,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] @@ -14220,7 +14216,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 ; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm2 @@ -14234,7 +14230,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1 @@ -14252,7 +14248,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 ; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm4 ; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm6 @@ -14281,10 +14277,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm19 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -14416,7 +14412,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm1) | mem ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] ; AVX512DQ-NEXT: vpermd %zmm2, %zmm21, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) @@ -14440,7 +14436,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] ; AVX512DQ-NEXT: vpermd %zmm2, %zmm28, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) @@ -14525,12 +14521,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -14539,7 +14535,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 704(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 640(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 512(%rax) @@ -14552,7 +14548,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX512DQ-FCP-NEXT: subq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 @@ -14561,85 +14557,84 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm4 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm10, %ymm0 ; 
AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm15 
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm8 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 -; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm9, %ymm20 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] @@ -14652,26 +14647,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm29 ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,4,u,u,u,5,u,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm6, %ymm11 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 ; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 @@ -14680,33 +14675,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm21 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [5,u,u,u,6,u,u,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] @@ -14724,52 +14718,52 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,u,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; 
AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 
%ymm21, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm20 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 @@ -14777,82 +14771,82 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm1 & (zmm4 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm22, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,u,u,u,5,u,u,5,u,u,u,6,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [8,9,9,u,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vprold $16, %xmm6, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,u,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm4 & (zmm5 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,1,8,9,9,u] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 @@ -14863,8 +14857,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 @@ -14873,232 +14867,233 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} zmm8 = zmm5 ^ (zmm4 & (zmm8 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm11 & (zmm29 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm11 ; AVX512DQ-FCP-NEXT: 
vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,2,2,3,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,2,2,3,8,9,9,u] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8,9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (zmm1 & (zmm2 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm13 ^ (zmm23 & (zmm8 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm23 & (zmm8 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 
96(%r8), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm23 & (zmm15 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm26 & (zmm19 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,u,1,8,8,9,u] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm20 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm28 & (zmm12 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm27 & (zmm11 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm23 & (ymm20 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [8,9,9,u,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm14 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm14 +; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm17 & (zmm14 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm28 & (zmm7 ^ zmm15)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm28 & (zmm1 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm27 & (zmm10 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm17 & (zmm5 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm10)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm16 = mem ^ (ymm0 & (ymm16 ^ mem)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm16)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-FCP-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,13,u,u,u,14,u,u,14,u,u,u,15,u,u,15] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & zmm6) | mem +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm1 = (zmm1 & zmm6) | mem -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload @@ -15106,8 +15101,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -15119,12 +15114,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm13, %ymm15, %ymm13 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] @@ -15133,46 +15130,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm28 & (zmm6 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm27 & (zmm6 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm28 & (zmm8 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm1 ^ (zmm4 & (zmm6 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm4 & (zmm8 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 | (zmm0 & mem) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm27 & (zmm8 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm4 ^ (mem & (zmm9 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm4 | (zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm12 = zmm12 | (zmm0 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm0 & (zmm6 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm0 & (zmm8 ^ zmm1)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQ-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512DQ-FCP-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -15185,125 +15180,125 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm31 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm3 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm6 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm29, %zmm24, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm29 {%k3} ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm29 {%k3} ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm7 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm2, %zmm21 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 ; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm25 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm3, %zmm25 ; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm25, %zmm22 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} @@ -15311,7 +15306,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm25, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -15323,33 +15318,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm2, %zmm10 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 ; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm18 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm24 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm10, %zmm1 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -15358,44 +15353,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm20 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; 
AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} ; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm23, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm23, %zmm2, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512BW-NEXT: addq $136, %rsp @@ -15411,125 +15406,125 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), 
%zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 
-; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm6 ; AVX512BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm24, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm29 {%k3} ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm25 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} +; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm23, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm2, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm29 {%k3} ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm19, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm7 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 ; AVX512BW-FCP-NEXT: movl $202911840, %eax # imm = 0xC183060 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm25 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2w %zmm19, %zmm3, %zmm25 ; AVX512BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} @@ -15537,7 +15532,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm25, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -15549,33 +15544,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 ; AVX512BW-FCP-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 +; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm24 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -15584,44 +15579,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm20 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} ; AVX512BW-FCP-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-FCP-NEXT: vpermi2w %zmm23, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm23, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512BW-FCP-NEXT: addq $136, %rsp @@ -15637,125 +15632,125 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm24, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm29 {%k3} ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} +; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm23, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm2, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm31, %zmm29 {%k3} ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm19, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: 
vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 ; AVX512DQ-BW-NEXT: movl $202911840, %eax # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm25 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2w %zmm19, %zmm3, %zmm25 ; AVX512DQ-BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} @@ -15763,7 +15758,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm25, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -15775,33 +15770,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512DQ-BW-NEXT: kmovd %eax, 
%k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm30, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 +; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm24 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -15810,44 +15805,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm20 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] -; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm1, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} ; AVX512DQ-BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: 
vmovdqu16 %zmm28, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-NEXT: vpermi2w %zmm23, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm23, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512DQ-BW-NEXT: addq $136, %rsp @@ -15863,125 +15858,125 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm31, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm24, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm29 {%k3} ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm6 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, 
%zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm23, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm2, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm29 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm19, %zmm2, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movl $202911840, %eax # imm = 0xC183060 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm19, %zmm3, %zmm25 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} @@ -15989,7 +15984,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm25, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -16001,33 +15996,33 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 
%zmm24, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] @@ -16036,44 +16031,44 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm20 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm1, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm23, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm23, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $136, %rsp diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 9c9dca82f60ca..aa8c5e2dff8dc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -27,8 +27,8 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%r11), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] @@ -53,9 +53,9 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vmovdqa (%r11), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vmovdqa (%r11), %xmm3 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] @@ -75,15 +75,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-NEXT: vmovdqa 
(%r11), %xmm3 +; AVX2-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -97,15 +97,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3 +; AVX2-FP-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -119,15 +119,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3 +; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -141,15 +141,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vmovdqa (%r11), %xmm3 +; AVX512-NEXT: vmovdqa (%r10), %xmm3 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -163,15 +163,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # 
%bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -185,15 +185,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -207,15 +207,15 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -233,14 +233,14 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, 
%xmm3, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -254,14 +254,14 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -275,14 +275,14 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -296,14 +296,14 @@ define void @store_i16_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ 
-535,7 +535,6 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -561,6 +560,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 @@ -623,7 +623,6 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -648,6 +647,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -710,7 +710,6 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -735,6 +734,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -764,7 +764,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -789,7 +789,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = 
xmm4[0],xmm3[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -814,7 +814,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -839,7 +839,7 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,32,36,40,44,1,5,9,13,33,37,41,45,2,6,10,14,34,38,42,46,3,7,11,15,35,39,43,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -1029,25 +1029,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1093,25 +1093,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1157,25 +1157,25 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,151519488,0,185205506] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] ; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [151519488,0,185205506,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,218891524,0,252577542] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [218891524,0,252577542,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1219,27 +1219,27 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = 
[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1283,27 +1283,27 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1347,27 +1347,27 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1411,27 +1411,27 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = 
[u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -1471,13 +1471,13 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1495,13 +1495,13 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1519,13 +1519,13 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1543,13 +1543,13 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,56,1,9,17,25,33,41,49,57,2,10,18,26,34,42,50,58,3,11,19,27,35,43,51,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,36,44,52,60,5,13,21,29,37,45,53,61,6,14,22,30,38,46,54,62,7,15,23,31,39,47,55,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -2128,32 +2128,32 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm4 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm12 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm14 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] @@ -2176,57 +2176,57 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%r10), %ymm13 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm15 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa 
{{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] ; 
AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2273,7 +2273,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,u,0,16,u,u,1,17,2,2,2,18,u,u,3,19] ; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 ; AVX512-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512-NEXT: vmovdqa (%rdx), %xmm7 @@ -2281,17 +2281,17 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,16,1,u,1,17,u,u,2,18,3,3,3,19,u,u] ; AVX512-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] ; AVX512-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 ; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] ; AVX512-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] @@ -2344,7 +2344,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,u,0,16,u,u,1,17,2,2,2,18,u,u,3,19] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 @@ -2352,17 +2352,17 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = 
[0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,16,1,u,1,17,u,u,2,18,3,3,3,19,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] @@ -2415,7 +2415,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,u,0,16,u,u,1,17,2,2,2,18,u,u,3,19] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 @@ -2423,17 +2423,17 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,16,1,u,1,17,u,u,2,18,3,3,3,19,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] @@ -2486,7 +2486,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,u,0,16,u,u,1,17,2,2,2,18,u,u,3,19] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 @@ -2494,17 +2494,17 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,16,1,u,1,17,u,u,2,18,3,3,3,19,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] @@ -2551,26 +2551,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = 
[0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2593,26 +2593,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-86, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 
%zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2635,26 +2635,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-86, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2677,26 +2677,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,16,32,48,0,0,0,0,1,17,33,49,0,0,0,0,2,18,34,50,0,0,0,0,3,19,35,51,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,52,0,0,0,0,5,21,37,53,0,0,0,0,6,22,38,54,0,0,0,0,7,23,39,55,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,24,40,56,0,0,0,0,9,25,41,57,0,0,0,0,10,26,42,58,0,0,0,0,11,27,43,59,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [12,28,44,60,0,0,0,0,13,29,45,61,0,0,0,0,14,30,46,62,0,0,0,0,15,31,47,63,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -3819,7 +3819,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3833,7 +3833,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3844,16 +3844,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] @@ -3878,11 +3878,11 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm15, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -3890,16 +3890,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -3946,10 +3946,10 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 @@ -3957,45 +3957,45 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm11 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd 
%ymm0, %ymm4, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 @@ -4022,29 +4022,29 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] @@ -4052,7 +4052,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] @@ -4102,9 +4102,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-NEXT: kmovw %r11d, %k1 ; AVX512-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} @@ -4114,9 +4114,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} @@ -4126,18 +4126,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm12 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-NEXT: vpermd %zmm6, %zmm20, %zmm14 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] ; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} ; AVX512-NEXT: vpunpckhwd 
{{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] @@ -4242,7 +4242,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 ; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-FCP-NEXT: kmovw %r11d, %k1 @@ -4253,9 +4253,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,1,1,1,1,u,u,2,u,3,u,3,u,u,u] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512-FCP-NEXT: kmovw %r11d, %k2 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} @@ -4265,18 +4265,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm13 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -4295,12 +4295,12 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1} ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] @@ -4383,9 +4383,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-NEXT: kmovw %r11d, %k1 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} @@ -4395,9 +4395,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512DQ-NEXT: kmovw %r11d, %k2 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} @@ -4407,18 +4407,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm12 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] @@ -4523,7 +4523,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 @@ -4534,9 +4534,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,1,1,1,1,u,u,2,u,3,u,3,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} @@ -4546,18 +4546,18 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm13 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; 
AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -4576,12 +4576,12 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] @@ -4664,16 +4664,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -4681,80 +4681,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-86, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = 
[0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vpmovsxbw 
{{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -4782,16 +4782,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -4799,80 +4799,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-86, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = 
[0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-FCP-NEXT: 
vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -4900,16 +4900,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -4917,80 +4917,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-86, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = 
[0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: 
vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -5018,16 +5018,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -5035,80 +5035,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-86, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, 
%zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
zmm7 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -7258,50 +7258,50 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm12 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm13 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm15, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm10 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%r10), %xmm6 @@ -7312,7 +7312,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] @@ -7330,16 +7330,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm14, %ymm15 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm13 @@ -7348,14 +7348,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm10, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -7389,11 +7389,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm4 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm15, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm14 @@ -7402,16 +7402,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -7444,11 
+7444,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,2,2,2,0,0,3,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,2,2,u,u,3,3] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,2,3,3,3,3,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] @@ -7456,16 +7456,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,u,u,1,1] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,u,0,u,u,u,1,u] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,1,1,1,1,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,1,1,1,1,u,u] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,1,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,u,1,u,1,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -7483,10 +7483,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%r10), %ymm10 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm11 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 @@ -7494,45 +7494,45 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm11, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, 
%ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 @@ -7556,35 +7556,35 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm12 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] @@ -7613,33 +7613,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm13, %ymm8 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] @@ -7649,7 +7649,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -7661,7 +7661,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11 @@ -7669,43 +7669,43 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm15 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm13, %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm13, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] @@ -7791,9 +7791,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%r8), %xmm7 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-NEXT: kmovw %r11d, %k2 ; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} @@ -7806,9 +7806,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-NEXT: vmovdqa 96(%r9), %ymm8 ; AVX512-NEXT: vmovdqa 96(%r8), %ymm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 96(%rcx), %ymm10 @@ -7816,10 +7816,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm13 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-NEXT: vpermd %zmm1, %zmm16, %zmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] @@ -7956,9 +7956,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermd %zmm6, %zmm30, %zmm8 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; AVX512-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -8064,7 +8064,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 ; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512-FCP-NEXT: kmovw %r11d, %k2 @@ -8080,9 +8080,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512-FCP-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,1,1,u,u,2,u,3,u,3,u,u,u] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512-FCP-NEXT: kmovw %r11d, %k1 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} @@ -8092,19 +8092,19 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm13 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm15 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] ; AVX512-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] @@ -8192,7 +8192,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 ; AVX512-FCP-NEXT: vmovdqa 96(%r10), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rax), %xmm3 @@ -8237,7 +8237,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 @@ -8338,9 +8338,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-NEXT: kmovw %r11d, %k2 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} @@ -8353,9 +8353,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm8 ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm10 @@ -8363,10 +8363,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm13 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] @@ -8503,9 +8503,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermd %zmm6, %zmm30, %zmm8 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -8611,7 +8611,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 @@ -8627,9 +8627,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,1,1,u,u,2,u,3,u,3,u,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} @@ -8639,19 +8639,19 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] ; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] @@ -8739,7 +8739,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %xmm3 @@ -8784,7 +8784,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 @@ -8880,57 +8880,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-NEXT: vmovdqa64 
%zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -8952,10 +8952,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -8976,28 +8976,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; 
AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -9016,28 +9016,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -9169,57 +9169,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 
%zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -9241,10 +9241,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -9265,28 +9265,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -9305,28 +9305,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -9458,57 +9458,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
[u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -9530,10 +9530,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -9554,28 +9554,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -9594,28 +9594,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -9747,57 +9747,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,0,0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,0,0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,0,0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,0,0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm31 = 
[0,0,0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -9819,10 +9819,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [0,0,0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,0,0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -9843,28 +9843,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,0,0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm3, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,0,12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,0,20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm29 = 
[0,0,16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [0,0,28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -9883,28 +9883,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,36,0,0,0,0,0,0,5,37,0,0,0,0,0,0,6,38,0,0,0,0,0,0,7,39,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,32,0,0,0,0,0,0,1,33,0,0,0,0,0,0,2,34,0,0,0,0,0,0,3,35,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [12,44,0,0,0,0,0,0,13,45,0,0,0,0,0,0,14,46,0,0,0,0,0,0,15,47,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [8,40,0,0,0,0,0,0,9,41,0,0,0,0,0,0,10,42,0,0,0,0,0,0,11,43,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [20,52,0,0,0,0,0,0,21,53,0,0,0,0,0,0,22,54,0,0,0,0,0,0,23,55,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,0,0,0,0,0,0,17,49,0,0,0,0,0,0,18,50,0,0,0,0,0,0,19,51,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [28,60,0,0,0,0,0,0,29,61,0,0,0,0,0,0,30,62,0,0,0,0,0,0,31,63,0,0,0,0,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [24,56,0,0,0,0,0,0,25,57,0,0,0,0,0,0,26,58,0,0,0,0,0,0,27,59,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index c2ce612c33c2e..0bb4f30ca8820 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -186,7 +186,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -196,7 +196,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -206,7 +206,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -216,7 +216,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -226,7 +226,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -236,7 +236,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vmovaps 
{{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -246,7 +246,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -256,7 +256,7 @@ define void @store_i32_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -344,7 +344,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -354,7 +354,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -364,7 +364,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -374,7 +374,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -384,7 +384,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermps 
%zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -394,7 +394,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -404,7 +404,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -414,7 +414,7 @@ define void @store_i32_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -548,9 +548,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -561,9 +561,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -574,9 +574,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -587,9 +587,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -600,9 +600,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -613,9 +613,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -626,9 +626,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -639,9 +639,9 @@ define void @store_i32_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -872,10 +872,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -892,10 +892,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -912,10 +912,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -932,10 +932,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -952,10 +952,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -972,10 +972,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -992,10 +992,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1012,10 +1012,10 @@ define void @store_i32_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 @@ -1490,10 +1490,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1524,10 +1524,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1558,10 +1558,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1592,10 +1592,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ 
-1626,10 +1626,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1660,10 +1660,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1694,10 +1694,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -1728,10 +1728,10 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 39230b67d380f..2fc22acc65d8d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -99,7 +99,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 16(%rcx) @@ -113,7 +113,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -127,7 +127,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, 16(%rcx) @@ -141,7 +141,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -155,7 +155,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx) @@ -169,7 +169,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -183,7 +183,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} 
xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx) @@ -197,7 +197,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,1,3,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,1,3,9,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx) @@ -264,12 +264,12 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6] +; AVX2-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vbroadcastsd (%rdx), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] ; AVX2-NEXT: vmovaps %xmm0, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-NEXT: vzeroupper @@ -281,12 +281,12 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6] +; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] ; AVX2-FP-NEXT: vmovaps %xmm0, 32(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FP-NEXT: vzeroupper @@ -296,14 +296,14 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7,3,7,3,7,3,7,3] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [u,3,7,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6] +; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm3 ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd (%rdx), 
%ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FCP-NEXT: vmovaps %xmm1, 32(%rcx) ; AVX2-FCP-NEXT: vzeroupper @@ -314,7 +314,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-NEXT: vmovaps %ymm0, (%rcx) @@ -326,7 +326,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -338,7 +338,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx) @@ -350,7 +350,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -362,7 +362,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx) @@ -374,7 +374,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, 
%zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -386,7 +386,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx) @@ -398,7 +398,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -560,8 +560,7 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [5,u,u,6,u,u,7,u] ; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] @@ -578,9 +577,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -592,9 +591,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -606,9 +605,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, 
(%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -620,9 +619,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -634,9 +633,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -648,9 +647,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -662,9 +661,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -676,9 +675,9 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [13,21,6,14,22,7,15,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -839,17 +838,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] -; AVX2-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] +; AVX2-NEXT: vbroadcastsd (%rdx), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] ; AVX2-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -871,9 +870,9 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovaps %ymm0, 128(%rcx) @@ -881,7 +880,7 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm8, 96(%rcx) ; AVX2-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -890,17 +889,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3 +; 
AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,0,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] -; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] ; AVX2-FP-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -922,9 +921,9 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rcx) @@ -932,56 +931,56 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i32_stride3_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] ; 
AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm7, %ymm8 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] -; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm7, %ymm8 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm10 +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm10 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6],ymm6[7] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm6, 160(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -991,17 +990,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), 
%zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1014,17 +1013,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1037,17 +1036,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 
= [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1060,17 +1059,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1083,17 +1082,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbd 
{{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1106,17 +1105,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1129,17 +1128,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, 
%zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1152,17 +1151,17 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -1520,19 +1519,19 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-NEXT: vpermilps {{.*#+}} xmm14 = mem[1,0,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[0,1,0,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-NEXT: vbroadcastsd 120(%rdi), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] @@ -1629,19 +1628,19 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm14 = mem[1,0,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[0,1,0,1] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-FP-NEXT: vbroadcastsd 120(%rdi), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] @@ -1702,91 +1701,91 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FCP-LABEL: store_i32_stride3_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm12 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm12 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm13 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: 
vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm8, %ymm5 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [1,0,2,2,1,0,2,2] +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] ; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FCP-NEXT: vpermps %ymm13, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm7 = [5,6,5,6,5,6,7,7] +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vbroadcastsd 88(%rdi), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-FCP-NEXT: vpermps %ymm13, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] +; AVX2-FCP-NEXT: vpermps %ymm12, %ymm9, %ymm6 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm10[0,0,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] ; AVX2-FCP-NEXT: vbroadcastsd 64(%rdx), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm9, %ymm14 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = 
ymm8[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm7, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7] -; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FCP-NEXT: vpermps %ymm11, %ymm9, %ymm11 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm4[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm10 -; AVX2-FCP-NEXT: vbroadcastsd 120(%rdi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm14[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] -; AVX2-FCP-NEXT: vbroadcastsd 96(%rdx), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] +; AVX2-FCP-NEXT: vbroadcastsd 120(%rdi), %ymm11 +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6],ymm11[7] +; AVX2-FCP-NEXT: vpermps %ymm15, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] +; 
AVX2-FCP-NEXT: vbroadcastsd 96(%rdx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6],ymm9[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 352(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm12, 160(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm13, 128(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm13, 160(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm12, 128(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 224(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm5, 256(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1804,20 +1803,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1842,20 +1841,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1880,20 +1879,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1918,20 +1917,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1956,20 +1955,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -1994,20 +1993,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 
= [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2032,20 +2031,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2070,20 +2069,20 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 @@ -2553,9 +2552,9 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[0,2] ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm6[0] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] +; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm3[3,3] @@ -2605,7 +2604,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm1[1],xmm0[1] @@ -2614,6 +2613,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX-NEXT: vbroadcastsd 128(%rdx), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 144(%rsi), %xmm0 @@ -2736,7 +2736,7 @@ define void 
@store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm9, 448(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 384(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 352(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 288(%rcx) @@ -2765,10 +2765,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdx), %ymm6 @@ -2797,23 +2797,23 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,1,0,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-NEXT: vbroadcastsd 64(%rdx), %ymm1 +; AVX2-NEXT: vbroadcastsd 88(%rdi), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3008,10 +3008,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-FP-NEXT: vmovups 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm6 @@ -3040,23 +3040,23 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm2 = mem[1,0,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,1,0,1] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FP-NEXT: vbroadcastsd 64(%rdx), %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 88(%rdi), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3250,12 +3250,12 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm10 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] @@ -3278,40 +3278,40 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm7, %ymm8 +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm7, %ymm8 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm5[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm5, %ymm6, %ymm3 -; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vbroadcastsd 56(%rdi), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FCP-NEXT: vbroadcastsd 64(%rdx), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm10[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 88(%rdi), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 88(%rdi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm7, %ymm1 @@ -3326,8 +3326,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 120(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] @@ -3344,8 +3344,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 152(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -3362,8 +3362,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm0 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd 184(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] @@ -3380,8 +3380,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm9 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 216(%rdi), %ymm10 +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] @@ -3398,9 +3398,9 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rdi), %ymm12 ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vbroadcastsd 248(%rdi), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] ; AVX2-FCP-NEXT: vmovaps %ymm6, 736(%rcx) @@ -3456,20 +3456,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 
128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3522,20 +3522,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3588,20 +3588,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), 
%zmm9 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3654,20 +3654,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ 
-3720,20 +3720,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3786,20 +3786,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512BW-FCP-NEXT: 
vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3852,20 +3852,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 @@ -3918,20 +3918,20 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,0,22,6,0,23,7,0,24,8,0,25,9,0,26,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm7, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index c15eff9141fff..2923f6b42fdf3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -116,7 +116,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -145,7 +145,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -174,7 +174,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -203,7 +203,7 @@ define void @store_i32_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -340,7 +340,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%r8) ; AVX512-NEXT: vzeroupper @@ -353,7 +353,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinsertf128 $1, 
(%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -366,7 +366,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-NEXT: vzeroupper @@ -379,7 +379,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -392,7 +392,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -405,7 +405,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -418,7 +418,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -431,7 +431,7 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -659,9 +659,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm2, (%r8) @@ -674,9 +674,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -689,9 +689,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r8) @@ -704,9 +704,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -719,9 +719,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa 
(%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -734,9 +734,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -749,9 +749,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) @@ -764,9 +764,9 @@ define void @store_i32_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -1195,26 +1195,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-NEXT: vpermi2d 
%zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1230,26 +1230,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = 
[0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1265,26 +1265,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1300,26 +1300,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1335,26 +1335,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1370,26 +1370,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1405,26 +1405,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-NEXT: 
vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1440,26 +1440,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -2367,32 +2367,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2428,32 +2428,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2489,32 +2489,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2550,32 +2550,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2611,32 +2611,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2672,32 +2672,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2733,32 +2733,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2794,32 +2794,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -4727,32 +4727,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512-NEXT: movb $-86, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4840,32 +4840,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512-FCP-NEXT: movb $-86, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4953,32 +4953,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-NEXT: movb $-86, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; 
AVX512DQ-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5066,32 +5066,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movb $-86, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5179,32 +5179,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 
; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5292,32 +5292,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: movb $-86, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm18, 
%zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5405,32 +5405,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: movb $-86, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5518,32 +5518,32 
@@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,20,0,0,5,21,0,0,6,22,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $-86, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,0,0,2,18,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,16,0,0,1,17,0,0,2,18,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,12,28,0,0,13,29,0,0,14,30,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [12,28,0,0,13,29,0,0,14,30,0,0,15,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,8,24,0,0,9,25,0,0,10,26,0,0,11,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [8,24,0,0,9,25,0,0,10,26,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 0fba7de803488..38e57277dd9a5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -71,7 +71,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vmovd %eax, %xmm3 ; AVX2-NEXT: vpbroadcastd %xmm3, 
%ymm3 @@ -94,7 +94,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovq %rax, %xmm2 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovd %eax, %xmm3 ; AVX2-FP-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -117,7 +117,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovq %rax, %xmm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,6,0,1,3,5] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovd %eax, %xmm3 ; AVX2-FCP-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -140,7 +140,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vmovlps %xmm1, 32(%r9) @@ -159,7 +159,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -178,7 +178,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-NEXT: vmovlps %xmm1, 32(%r9) @@ -197,7 +197,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -216,7 +216,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; 
AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512BW-NEXT: vmovlps %xmm1, 32(%r9) @@ -235,7 +235,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512BW-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -254,7 +254,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, 32(%r9) @@ -273,7 +273,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, 32(%r9) @@ -449,7 +449,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [u,3,7,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] @@ -464,13 +464,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-NEXT: vmovdqa %xmm1, 64(%r9) @@ -481,13 +481,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # 
%bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -498,13 +498,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-NEXT: vmovdqa %xmm1, 64(%r9) @@ -515,13 +515,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -532,13 +532,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vmovdqa %xmm1, 
64(%r9) @@ -549,13 +549,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -566,13 +566,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9) @@ -583,13 +583,13 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,11,15,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [7,11,15,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9) @@ -781,9 +781,9 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5],ymm7[6,7] ; AVX2-NEXT: vunpckhps 
{{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] @@ -843,9 +843,9 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5],ymm7[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] @@ -904,9 +904,9 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5],ymm7[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] @@ -938,15 +938,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) @@ -963,15 +963,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm3 = [6,14,0,23,31,7,15,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -988,15 +988,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1013,15 +1013,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1038,15 +1038,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1063,15 +1063,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1088,15 +1088,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; 
AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1113,15 +1113,15 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,14,0,23,31,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,19,27,0,4,12,20,28,0,5,13,21,29,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,8,16,24,0,1,9,17,25,0,2,10,18,26,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9) @@ -1447,13 +1447,13 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm4 ; AVX2-NEXT: vmovaps (%rcx), %ymm2 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX2-NEXT: vmovaps (%r8), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%r8), %ymm6 +; AVX2-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-NEXT: vmovaps (%rdi), %xmm10 @@ -1489,7 +1489,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; 
AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm1[0,1,2,1] +; AVX2-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] @@ -1497,8 +1498,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] @@ -1515,48 +1516,48 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-NEXT: vmovaps (%rsi), %ymm0 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4,5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = 
ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3,4],ymm14[5],ymm8[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-NEXT: vmovaps %ymm8, 128(%r9) +; AVX2-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-NEXT: vmovaps %ymm1, 128(%r9) ; AVX2-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-NEXT: vmovaps %ymm12, 256(%r9) ; AVX2-NEXT: vmovaps %ymm11, 160(%r9) @@ -1572,13 +1573,13 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm4 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm2 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX2-FP-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm6 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10 @@ -1614,7 +1615,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] @@ -1622,8 +1624,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-FP-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] @@ -1640,48 +1642,48 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm0 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3,4],ymm14[5],ymm8[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = 
ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FP-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-FP-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FP-NEXT: vmovaps %ymm8, 128(%r9) +; AVX2-FP-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9) ; AVX2-FP-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-FP-NEXT: vmovaps %ymm12, 256(%r9) ; AVX2-FP-NEXT: vmovaps %ymm11, 160(%r9) @@ -1695,15 +1697,17 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FCP-LABEL: store_i32_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm4 +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [0,1,0,1,u,u,2,2] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm8 @@ -1714,9 +1718,9 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = 
xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm5 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,2,2] @@ -1725,90 +1729,88 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm8 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm11 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [0,1,3,2,3,2,3,2] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm3, %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm11 ; AVX2-FCP-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm4[1,2],ymm13[3,4],ymm4[5,6],ymm13[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r8), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r8), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm8[1,2,3,4],ymm0[5],ymm8[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3,4],ymm0[5,6],ymm13[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; 
AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm10 -; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm0[1,2,3],ymm9[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3,4],ymm0[5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,1,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1,2],ymm15[3,4],ymm3[5,6],ymm15[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4],ymm15[5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4],ymm15[5],ymm1[6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] -; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1,3,0,4,5,7,4] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm13, 288(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm8, 256(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FCP-NEXT: vzeroupper @@ -1816,528 +1818,528 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i32_stride5_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512-NEXT: # 
zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: 
vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i32_stride5_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; 
AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i32_stride5_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} 
zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i32_stride5_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, 
%zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i32_stride5_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), 
%zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, 
%zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i32_stride5_vf16: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] 
-; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i32_stride5_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm6 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3040,7 +3042,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i32_stride5_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-NEXT: subq $584, %rsp # imm = 0x248 ; AVX2-NEXT: vmovaps (%r8), %ymm15 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%r8), %ymm14 @@ -3136,7 +3138,8 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-NEXT: vmovaps 96(%r8), %ymm3 @@ -3148,139 +3151,140 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdx), %ymm11 -; AVX2-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-NEXT: vmovaps 32(%rcx), %ymm11 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm8 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 48(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdx), %ymm7 -; AVX2-NEXT: vmovaps 64(%rcx), %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX2-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 80(%r8), %ymm2 ; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] -; AVX2-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] -; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: vbroadcastsd 
112(%r8), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4],ymm15[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2,3,4],ymm0[5],ymm12[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3,4],ymm0[5,6],ymm12[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4],ymm0[5,6,7] +; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4,5,6],ymm12[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] +; AVX2-NEXT: vpermilps $52, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps %ymm0, 544(%r9) -; AVX2-NEXT: vmovaps %ymm4, 384(%r9) -; AVX2-NEXT: vmovaps %ymm8, 224(%r9) -; AVX2-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-NEXT: 
vmovaps %ymm13, 608(%r9) +; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps %ymm1, 544(%r9) +; AVX2-NEXT: vmovaps %ymm5, 384(%r9) +; AVX2-NEXT: vmovaps %ymm9, 224(%r9) +; AVX2-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-NEXT: vmovaps %ymm12, 608(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3311,13 +3315,13 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: addq $552, %rsp # imm = 0x228 +; AVX2-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i32_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-FP-NEXT: subq $584, %rsp # imm = 0x248 ; AVX2-FP-NEXT: vmovaps (%r8), %ymm15 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm14 @@ -3413,7 +3417,8 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm3 @@ -3425,139 +3430,140 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[6],ymm6[6],ymm14[7],ymm6[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm11 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm11 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm8 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 
= ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm7 -; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 80(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = 
ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd 112(%r8), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4],ymm15[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2,3,4],ymm0[5],ymm12[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3,4],ymm0[5,6],ymm12[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4,5,6],ymm12[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] +; AVX2-FP-NEXT: vpermilps $52, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = 
mem[0,1,3,0,4,5,7,4] +; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %ymm0, 544(%r9) -; AVX2-FP-NEXT: vmovaps %ymm4, 384(%r9) -; AVX2-FP-NEXT: vmovaps %ymm8, 224(%r9) -; AVX2-FP-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-FP-NEXT: vmovaps %ymm13, 608(%r9) +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] +; 
AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 544(%r9) +; AVX2-FP-NEXT: vmovaps %ymm5, 384(%r9) +; AVX2-FP-NEXT: vmovaps %ymm9, 224(%r9) +; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FP-NEXT: vmovaps %ymm12, 608(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3588,259 +3594,262 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX2-FP-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i32_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $600, %rsp # imm = 0x258 +; AVX2-FCP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FCP-NEXT: vmovaps (%r8), %ymm11 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm14 +; AVX2-FCP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [0,1,0,1,u,u,2,2] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm9 = xmm8[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2],xmm9[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = 
ymm9[0,1,1,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4,5],ymm9[6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2],xmm9[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4,5],ymm9[6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm9 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm11 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2],xmm11[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm11 -; AVX2-FCP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm10 ; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpermps %ymm10, %ymm12, %ymm10 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; 
AVX2-FCP-NEXT: vpermps %ymm9, %ymm12, %ymm9 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm14 ; AVX2-FCP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm13[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm11 -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm10 +; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm10 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm7 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm11 = [0,1,3,2,3,2,3,2] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm11, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm12, %ymm6 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,3,2,3,2,3,2] +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm14, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm9 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm7, %ymm8 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps (%rsi), 
%ymm15 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm15[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4],ymm10[5,6],ymm6[7] ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2],ymm7[3,4],ymm10[5,6],ymm7[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4],ymm7[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm11, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm10 +; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm14, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3,4],ymm8[5,6],ymm3[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%r8), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm13[0],mem[0],xmm13[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; 
AVX2-FCP-NEXT: vmovaps %ymm4, %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 80(%r8), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm5 +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2],ymm15[3,4],ymm6[5,6],ymm15[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4,5,6],ymm15[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm11 +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm12 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1,2],ymm10[3,4],ymm2[5,6],ymm10[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4],ymm0[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 120(%r8), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = 
ymm0[0,1],ymm5[2],ymm0[3,4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0,2,3,7,4,6,7] +; AVX2-FCP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm2, 384(%r9) +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} 
ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 544(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm3, 384(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm6, 608(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm5, 608(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r9) @@ -3866,7 +3875,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FCP-NEXT: addq $600, %rsp # imm = 0x258 +; AVX2-FCP-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3875,17 +3884,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -3893,68 +3902,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512-NEXT: vpermt2d 
%zmm14, %zmm26, %zmm27 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-NEXT: kmovw %eax, %k3 ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -3965,11 +3974,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) @@ -3981,17 +3990,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; 
AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -3999,68 +4008,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = 
[0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4071,11 +4080,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4087,17 +4096,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQ-NEXT: 
vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4105,68 +4114,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512DQ-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512DQ-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512DQ-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4177,11 +4186,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-NEXT: 
vmovdqa64 %zmm29, 384(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4193,17 +4202,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4211,68 +4220,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4283,11 +4292,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4299,17 +4308,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4317,68 +4326,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = 
[0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4389,11 +4398,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4405,17 +4414,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4423,68 +4432,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4495,11 +4504,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4511,17 +4520,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-BW-NEXT: 
vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4529,68 +4538,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} 
zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4601,11 +4610,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) @@ -4617,17 +4626,17 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 @@ -4635,68 +4644,68 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm19, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm21, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm23, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm25, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm16 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm15, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm8 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm22, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} @@ -4707,11 +4716,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 320(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 448(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 512(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -6183,7 +6192,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i32_stride5_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $1736, %rsp # imm = 0x6C8 +; AVX2-NEXT: subq $1768, %rsp # imm = 0x6E8 ; AVX2-NEXT: vmovaps (%r8), %ymm15 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%r8), %ymm14 @@ -6298,8 +6307,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6324,8 +6333,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = 
ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6350,8 +6359,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6376,8 +6385,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6392,27 +6401,28 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -6426,21 +6436,21 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 48(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -6454,21 +6464,21 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm14 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = 
ymm14[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 80(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -6482,20 +6492,20 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdi), %ymm13 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 112(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 112(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -6503,49 +6513,49 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdx), %ymm4 -; AVX2-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} 
ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rcx), %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 144(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 144(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX2-NEXT: vmovaps 160(%rdx), %ymm3 ; AVX2-NEXT: vmovaps 160(%rcx), %ymm12 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm9 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 176(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 176(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6555,8 +6565,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdx), %ymm8 -; AVX2-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovaps 192(%rcx), %ymm7 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 192(%rdi), %ymm5 @@ -6564,11 +6574,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-NEXT: vbroadcastsd 208(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-NEXT: vbroadcastsd 208(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] @@ -6582,77 +6592,78 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] -; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = 
ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3,4],ymm6[5,6],ymm11[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4],ymm6[5,6,7] ; AVX2-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4,5,6],ymm11[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = 
ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1,2,3],ymm6[4,5],ymm14[6,7] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4],ymm14[5,6,7] ; AVX2-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-NEXT: vpermilps $78, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -6672,9 +6683,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] @@ -6690,7 +6701,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm4, 1024(%r9) ; AVX2-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-NEXT: vmovaps %ymm6, 544(%r9) ; AVX2-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 224(%r9) @@ -6760,13 +6771,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX2-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i32_stride5_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $1736, %rsp # imm = 0x6C8 +; AVX2-FP-NEXT: subq $1768, %rsp # imm = 0x6E8 ; AVX2-FP-NEXT: vmovaps (%r8), %ymm15 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm14 @@ -6881,8 +6892,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6907,8 +6918,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ 
-6933,8 +6944,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6959,8 +6970,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -6975,27 +6986,28 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7009,21 +7021,21 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7037,21 +7049,21 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 
64(%rsi), %ymm14 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 80(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7065,20 +7077,20 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 112(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 112(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7086,49 +7098,49 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm4 -; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 144(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 144(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm3 ; AVX2-FP-NEXT: vmovaps 160(%rcx), %ymm12 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm9 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: 
vbroadcastsd 176(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 176(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -7138,8 +7150,8 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm8 -; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm7 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5 @@ -7147,11 +7159,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vbroadcastsd 208(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 208(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] @@ -7165,77 +7177,78 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FP-NEXT: vmovups %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3,4],ymm6[5,6],ymm11[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4],ymm6[5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4,5,6],ymm11[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps 
$16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2,3],ymm6[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4],ymm6[5,6,7] ; AVX2-FP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: 
vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1,2,3],ymm6[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4],ymm14[5,6,7] ; AVX2-FP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -7255,9 +7268,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] @@ -7273,7 +7286,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm4, 1024(%r9) ; AVX2-FP-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-FP-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FP-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-FP-NEXT: vmovaps %ymm6, 544(%r9) ; AVX2-FP-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r9) @@ -7343,7 +7356,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX2-FP-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -7360,13 +7373,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,1,u,u,2,2] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm15 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm6 ; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm4 ; AVX2-FCP-NEXT: vmovaps 
32(%rcx), %xmm12 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,2,2] @@ -7379,17 +7392,17 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm12[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm8 -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm6 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,1,2,2] +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %xmm10 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] @@ -7495,105 +7508,106 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,1,3,2,3,2,3,2] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm11 = [0,1,3,2,3,2,3,2] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm15 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 16(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm10 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r8), %ymm5 ; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm5 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm7 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm6 +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 80(%r8), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 80(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] @@ -7609,32 +7623,32 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] +; AVX2-FCP-NEXT: vunpckhps 
{{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7648,32 +7662,32 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm5 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1,2],ymm1[3,4],ymm13[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 144(%r8), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 144(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7687,32 +7701,31 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm3 +; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm10 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1,2],ymm1[3,4],ymm11[5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3,4],ymm12[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%r8), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 176(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -7726,7 +7739,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: 
vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -7741,9 +7754,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 208(%r8), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 208(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] @@ -7762,7 +7775,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -7777,9 +7790,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4],ymm9[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] @@ -7796,9 +7809,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm15[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = 
ymm4[0,1,2],mem[3],ymm4[4,5,6,7] @@ -7809,12 +7822,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] -; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] ; AVX2-FCP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -7838,9 +7851,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2,3],ymm4[4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FCP-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4],ymm14[5,6,7] ; AVX2-FCP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FCP-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -7850,13 +7863,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7] -; AVX2-FCP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = mem[0,1,3,0,4,5,7,4] ; AVX2-FCP-NEXT: vpermilps $78, (%rsp), %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = 
ymm14[0,1],ymm12[2],ymm14[3,4,5,6],ymm12[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] @@ -7881,7 +7894,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm13, 704(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm4, 544(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm9, 384(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm12, 224(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm11, 224(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8030,7 +8043,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -8039,6 +8051,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -8090,16 +8103,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8244,7 +8257,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: 
vmovdqa64 %zmm23, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -8253,6 +8265,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -8304,16 +8317,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8458,7 +8471,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -8467,6 +8479,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -8518,16 +8531,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 -; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8672,7 +8685,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -8681,6 +8693,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -8732,16 +8745,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -8886,7 +8899,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -8895,6 +8907,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -8946,16 +8959,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9100,7 +9113,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -9109,6 +9121,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -9160,16 +9173,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9314,7 +9327,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -9323,6 +9335,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -9374,16 +9387,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} @@ -9528,7 +9541,6 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 @@ -9537,6 +9549,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 @@ -9588,16 +9601,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 7fe0bcc0f3d8d..9aa80236d22f7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -148,7 +148,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512-NEXT: vmovaps %ymm0, (%rax) @@ -169,7 +169,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -190,7 +190,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rax) @@ -211,7 +211,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -232,7 +232,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512BW-NEXT: vmovaps %ymm0, (%rax) @@ -253,7 +253,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -274,7 +274,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rax) @@ -295,7 +295,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -403,20 +403,20 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps (%r9), %xmm5 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm10 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 +; AVX2-NEXT: vpermps %ymm7, %ymm8, %ymm10 ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5] ; AVX2-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-NEXT: vpermps %ymm6, %ymm11, %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-NEXT: vpermps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX2-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6] -; AVX2-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] @@ -426,11 +426,11 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] ; AVX2-NEXT: # 
ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-NEXT: vmovaps %ymm9, (%rax) +; AVX2-NEXT: vmovaps %ymm8, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -445,20 +445,20 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm9, %ymm10 +; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm8, %ymm10 ; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5] ; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpermps %ymm6, %ymm11, %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpermps %ymm9, %ymm8, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] +; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] @@ -468,11 +468,11 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] ; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm8, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -487,20 +487,20 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm9, %ymm10 +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm8, %ymm10 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5] ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm9, %ymm9 -; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] @@ -510,11 +510,11 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm8, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -523,14 +523,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rax) @@ -542,14 +542,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -561,14 +561,14 @@ define void 
@store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rax) @@ -580,14 +580,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -599,14 +599,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -618,14 +618,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: 
vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -637,14 +637,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rax) @@ -656,14 +656,14 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rax) @@ -843,78 +843,78 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 -; AVX2-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] -; AVX2-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] -; 
AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] -; AVX2-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm12 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-NEXT: vpbroadcastd (%rcx), %xmm12 -; AVX2-NEXT: vpbroadcastd (%rdx), %xmm13 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm10, %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-NEXT: vpbroadcastd %xmm12, %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,2,2,3] +; AVX2-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,2,2,3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm9[0],zero,xmm9[1],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastd (%rcx), %xmm11 +; AVX2-NEXT: vpbroadcastd (%rdx), %xmm12 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpbroadcastq %xmm9, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-NEXT: vpbroadcastd %xmm11, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm13 +; AVX2-NEXT: vmovdqa (%r8), %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm12[2,3],ymm7[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) -; 
AVX2-NEXT: vmovdqa %ymm8, 160(%rax) -; AVX2-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm11, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm6, (%rax) -; AVX2-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm7, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm10, 128(%rax) +; AVX2-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -923,89 +923,89 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm12 -; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm13 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm10, %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FP-NEXT: vpbroadcastd %xmm12, %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FP-NEXT: vmovdqa 
(%rdi), %xmm10 +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,2,2,3] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,2,2,3] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm9[0],zero,xmm9[1],zero +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm11 +; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm12 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastq %xmm9, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-FP-NEXT: vpbroadcastd %xmm11, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm13 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm12[2,3],ymm7[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm8, 160(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm11, 128(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm7, 160(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm10, 128(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i32_stride6_vf8: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm5 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm11 @@ -1020,9 +1020,9 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm12 ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm14[3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm13 ; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm14 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -1033,7 +1033,7 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] @@ -1048,27 +1048,27 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [6,0,0,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,u,u,u,u,u,7,u] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,6,0,0,0,0,0,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,6,u,u,u,u,u,7] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rax) @@ -1080,24 +1080,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i32_stride6_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), 
%zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1107,24 +1107,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i32_stride6_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1134,24 +1134,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i32_stride6_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1161,24 +1161,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i32_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; 
AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1188,24 +1188,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-LABEL: store_i32_stride6_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1215,24 +1215,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-FCP-LABEL: store_i32_stride6_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512BW-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1242,24 +1242,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-LABEL: store_i32_stride6_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1269,24 +1269,24 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -1650,19 +1650,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm8 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1671,24 +1671,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 
= xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm4 +; AVX2-NEXT: vpbroadcastd 32(%rcx), %xmm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-NEXT: vpbroadcastd 32(%rdx), %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm6, %ymm2 +; AVX2-NEXT: vpbroadcastq %xmm8, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1696,13 +1696,13 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 52(%r9), %ymm3 @@ -1720,64 +1720,64 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] ; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-NEXT: 
vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; AVX2-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-NEXT: 
vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm12, %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5],ymm8[6,7] ; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] ; AVX2-NEXT: vmovdqa (%r8), %ymm9 @@ -1787,20 +1787,20 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm8, 288(%rax) ; AVX2-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%rax) @@ -1830,19 +1830,19 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm8 ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1851,24 +1851,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm4 +; AVX2-FP-NEXT: vpbroadcastd 32(%rcx), %xmm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FP-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FP-NEXT: 
vpbroadcastd 32(%rdx), %xmm3 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm6, %ymm2 +; AVX2-FP-NEXT: vpbroadcastq %xmm8, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-FP-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1876,13 +1876,13 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm3 @@ -1900,64 +1900,64 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm11 # 16-byte Reload -; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm12, %ymm6 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,3,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm9 @@ -1967,20 +1967,20 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FP-NEXT: 
vpbroadcastd 16(%r9), %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm8, 288(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) @@ -2028,133 +2028,133 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm6 +; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm3 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm2 +; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[6],ymm14[6],ymm3[7],ymm14[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm2 ; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill 
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3] -; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm9 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; 
AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm12, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm6 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm15[2],ymm5[2],ymm15[3],ymm5[3],ymm15[6],ymm5[6],ymm15[7],ymm5[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = 
ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm15[0],ymm5[0],ymm15[1],ymm5[1],ymm15[4],ymm5[4],ymm15[5],ymm5[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 288(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 256(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2186,26 +2186,26 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: movb $36, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2216,52 +2216,51 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-110, %cl ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512-NEXT: vpermi2d %zmm1, %zmm7, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vpermi2d %zmm0, %zmm10, %zmm7 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 ; AVX512-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] -; AVX512-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm11[2,3,2,3,2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm2 = 
zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i32_stride6_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -2274,24 +2273,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-FCP-NEXT: movb $-110, %cl -; AVX512-FCP-NEXT: kmovw %ecx, %k2 +; AVX512-FCP-NEXT: movb $-110, %al +; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 -; AVX512-FCP-NEXT: movb $36, %cl -; AVX512-FCP-NEXT: kmovw %ecx, %k1 +; AVX512-FCP-NEXT: movb $36, %al +; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2300,9 +2299,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: 
vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2311,30 +2310,31 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2356,26 +2356,26 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512DQ-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-NEXT: movb $36, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2386,52 +2386,51 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-110, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm7 {%k2} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm7, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm10, %zmm7 +; 
AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm11[2,3,2,3,2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i32_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -2444,24 +2443,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: movb $-110, %cl -; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FCP-NEXT: movb $-110, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, 
%k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: movb $36, %cl -; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FCP-NEXT: movb $36, %al +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2470,9 +2469,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2481,30 +2480,31 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; 
AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2526,26 +2526,26 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: movb $36, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2556,52 +2556,51 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-110, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm11[2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vpermi2d 
%zmm0, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i32_stride6_vf16: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -2614,24 +2613,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: movb $-110, %cl -; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 +; AVX512BW-FCP-NEXT: movb $-110, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: movb $36, %cl -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: movb $36, %al +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2640,9 +2639,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2651,30 +2650,31 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -2696,26 +2696,26 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12] ; AVX512DQ-BW-NEXT: vpermi2d (%rcx), %ymm7, %ymm8 +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512DQ-BW-NEXT: movb $36, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2726,52 +2726,51 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-110, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm11[2,3,2,3,2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] +; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm6, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf16: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -2784,24 +2783,24 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: movb $-110, %cl -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-FCP-NEXT: movb $-110, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: movb $36, %cl -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: movb $36, %al +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2810,9 +2809,9 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2821,30 
+2820,31 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) @@ -3598,10 +3598,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-NEXT: vmovdqa 32(%r8), %xmm12 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero -; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero +; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -3614,8 +3614,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero -; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 36(%r9), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -3647,26 +3647,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-NEXT: vmovdqa 96(%rsi), %xmm6 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vmovdqa 96(%r8), %xmm9 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd (%rcx), %xmm8 -; AVX2-NEXT: vpbroadcastd (%rdx), %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-NEXT: vpbroadcastd (%rcx), %xmm10 +; AVX2-NEXT: vpbroadcastd (%rdx), %xmm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-NEXT: vpbroadcastq %xmm12, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3675,34 +3675,35 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastd 32(%rcx), %xmm10 ; AVX2-NEXT: vpbroadcastd 32(%rdx), %xmm11 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm12, %ymm3 +; AVX2-NEXT: vpbroadcastq %xmm13, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] @@ -3711,17 +3712,17 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 52(%r9), %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm12 -; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm13 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm12 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] @@ -3730,53 +3731,52 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-NEXT: vmovdqa 64(%rcx), %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,1,2,3] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm12 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,1,2,3] ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3796,24 +3796,25 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -3894,11 +3895,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] @@ -3906,18 +3906,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-NEXT: vpbroadcastd 112(%r9), %ymm13 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} 
ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[2,3],ymm6[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm0, 736(%rax) +; AVX2-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-NEXT: vmovdqa %ymm4, 544(%rax) @@ -3926,7 +3926,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %ymm2, 352(%rax) ; AVX2-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm8, 96(%rax) ; AVX2-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3982,10 +3982,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm12 -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -3998,8 +3998,8 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 36(%r9), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] @@ -4031,26 +4031,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FP-NEXT: vpermq 
{{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm6 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero +; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm9 +; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm8 -; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm9 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm10 +; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FP-NEXT: vpbroadcastq %xmm12, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4059,34 +4059,35 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastd 32(%rcx), %xmm10 ; AVX2-FP-NEXT: vpbroadcastd 32(%rdx), %xmm11 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm12, %ymm3 +; AVX2-FP-NEXT: vpbroadcastq %xmm13, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] @@ -4095,17 +4096,17 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm12 -; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm13 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm12 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] @@ -4114,53 +4115,52 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm12 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-FP-NEXT: 
vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4180,24 +4180,25 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FP-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -4278,11 +4279,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] @@ -4290,18 +4290,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vpbroadcastd 112(%r9), %ymm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa %ymm0, 736(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-FP-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm4, 544(%rax) @@ -4310,7 +4310,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %ymm2, 352(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4344,338 +4344,344 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-LABEL: store_i32_stride6_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $872, %rsp # imm = 0x368 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: 
vmovdqa 64(%r8), %xmm14 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm10 -; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm12 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 +; 
AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm2[0],zero,xmm2[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm13 +; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm14 +; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm7 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm14[3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm8 ; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm12 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm13, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm1 -; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), 
%xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm3 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm9[0],mem[0],xmm9[1],mem[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm14, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[6],ymm14[6],ymm3[7],ymm14[7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd %xmm15, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; 
AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[1,1,2,3,5,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm15 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm14 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 
32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[4],ymm8[4],ymm11[5],ymm8[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm8 -; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: 
vpbroadcastd 112(%r9), %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[2,3],ymm8[2,3] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3,4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[4],ymm14[4],ymm2[5],ymm14[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm10, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; 
AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm15, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 672(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm3, 736(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm2, 672(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 640(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 544(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 480(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 448(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 352(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm13, 288(%rax) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, 352(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) @@ -4743,27 +4749,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512-NEXT: movb $36, %dl ; AVX512-NEXT: kmovw %edx, %k1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 +; AVX512-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -4772,18 +4778,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-110, %cl ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -4805,23 +4811,23 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512-NEXT: vpermt2d 
%zmm8, %zmm22, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -4856,19 +4862,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -4889,7 +4895,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = 
[3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4899,42 +4905,42 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm1 ; AVX512-FCP-NEXT: movb $-110, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512-FCP-NEXT: movb $36, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -4945,21 +4951,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5018,27 +5024,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512DQ-NEXT: movb $36, %dl ; AVX512DQ-NEXT: kmovw %edx, %k1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 ; 
AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5047,18 +5053,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-110, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -5080,23 +5086,23 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-NEXT: 
vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5131,19 +5137,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5164,7 +5170,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5174,42 +5180,42 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; 
AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: movb $-110, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQ-FCP-NEXT: movb $36, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5220,21 +5226,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5293,27 +5299,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512BW-NEXT: movb $36, %dl ; AVX512BW-NEXT: kmovd %edx, %k1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = 
[9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5322,18 +5328,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-110, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -5355,23 +5361,23 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5406,19 +5412,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5439,7 +5445,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5449,42 +5455,42 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: 
vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: movb $-110, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-FCP-NEXT: movb $36, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5495,21 +5501,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -5568,27 +5574,27 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] ; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512DQ-BW-NEXT: movb $36, %dl ; AVX512DQ-BW-NEXT: kmovd %edx, %k1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5597,18 +5603,18 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-110, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} @@ -5630,23 +5636,23 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] @@ -5681,19 +5687,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] @@ -5714,7 +5720,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -5724,42 +5730,42 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $-110, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $36, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 @@ -5770,21 +5776,21 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 @@ -7295,45 +7301,46 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-LABEL: store_i32_stride6_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX2-NEXT: subq $2472, %rsp # imm = 0x9A8 ; AVX2-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] ; AVX2-NEXT: vmovdqa (%rdx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; 
AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] @@ -7345,13 +7352,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] -; AVX2-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX2-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] @@ -7391,18 +7399,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1] -; AVX2-NEXT: vmovdqa 128(%rsi), %xmm8 +; AVX2-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] ; AVX2-NEXT: vmovdqa 128(%r8), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastd 132(%r9), %ymm12 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 132(%r9), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7453,70 +7462,69 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1] -; AVX2-NEXT: vmovdqa 224(%rsi), %xmm14 -; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,1,2,1] +; AVX2-NEXT: vmovdqa 224(%rsi), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 224(%r8), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 228(%r9), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd (%rcx), %xmm0 -; AVX2-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-NEXT: vpbroadcastd (%rdx), %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm1 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-NEXT: vmovdqa (%rdx), %ymm8 +; AVX2-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 32(%rcx), %xmm2 -; AVX2-NEXT: vbroadcastss 32(%rdx), %xmm14 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-NEXT: vbroadcastss 32(%rcx), %xmm1 +; AVX2-NEXT: vbroadcastss 32(%rdx), %xmm3 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vbroadcastss %xmm0, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7535,25 +7543,27 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; 
AVX2-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm14 -; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 64(%rcx), %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd 64(%rdx), %xmm14 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX2-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -7596,26 +7606,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd 128(%rcx), %xmm14 -; AVX2-NEXT: vpbroadcastd 128(%rdx), %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpbroadcastd 116(%r9), %ymm15 +; AVX2-NEXT: vpbroadcastd 128(%rcx), %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd 128(%rdx), %xmm14 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-NEXT: vmovdqa 128(%r9), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 128(%r9), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 128(%rdx), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 128(%rdx), %ymm9 -; AVX2-NEXT: vmovdqa 128(%rcx), %ymm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vmovdqa 128(%rcx), %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 @@ -7688,40 +7700,39 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpbroadcastd 212(%r9), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vpbroadcastd 212(%r9), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 224(%rcx), %xmm14 -; AVX2-NEXT: vbroadcastss 224(%rdx), %xmm15 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vbroadcastss %xmm0, %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX2-NEXT: vbroadcastss 224(%rcx), %xmm0 +; AVX2-NEXT: vbroadcastss 
224(%rdx), %xmm14 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vbroadcastsd (%rsp), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 224(%r9), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vbroadcastss %xmm1, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 224(%rdx), %ymm14 -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 224(%rcx), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-NEXT: vmovdqa 224(%rdx), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 224(%rcx), %ymm14 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,1,2,3] +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 224(%rsi), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpbroadcastd 244(%r9), %ymm15 @@ -7731,8 +7742,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -7742,31 +7753,29 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = 
ymm15[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps (%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-NEXT: vbroadcastss 16(%r9), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa (%r8), %ymm15 +; AVX2-NEXT: vpbroadcastd 16(%r9), %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] @@ -7785,17 +7794,17 @@ define 
void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-NEXT: vpbroadcastd 48(%r9), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -7824,7 +7833,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -7833,7 +7843,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpbroadcastd 80(%r9), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -7877,9 +7887,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = mem[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = 
mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -7898,15 +7908,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpbroadcastd 144(%r9), %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -7937,15 +7948,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa 160(%r8), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-NEXT: vmovdqa 160(%r8), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] @@ -7968,73 +7979,72 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] -; AVX2-NEXT: vpbroadcastd 208(%r9), %ymm15 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 192(%r8), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-NEXT: vpbroadcastd 208(%r9), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6],ymm12[7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX2-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = mem[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = mem[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovaps 224(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-NEXT: vbroadcastss 240(%r9), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vpermilps $250, (%rsp), %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = mem[2,2,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = mem[2,2,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vmovdqa 224(%r8), %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] +; AVX2-NEXT: vpbroadcastd 240(%r9), %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[2,3],ymm13[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = 
mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-NEXT: vmovaps %ymm4, 1440(%rax) -; AVX2-NEXT: vmovaps %ymm13, 1408(%rax) +; AVX2-NEXT: vmovdqa %ymm13, 1504(%rax) +; AVX2-NEXT: vmovdqa %ymm5, 1440(%rax) +; AVX2-NEXT: vmovaps %ymm4, 1408(%rax) ; AVX2-NEXT: vmovdqa %ymm12, 1312(%rax) -; AVX2-NEXT: vmovdqa %ymm2, 1248(%rax) +; AVX2-NEXT: vmovdqa %ymm1, 1248(%rax) ; AVX2-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-NEXT: vmovdqa %ymm3, 1056(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 1056(%rax) ; AVX2-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-NEXT: vmovdqa %ymm8, 928(%rax) -; AVX2-NEXT: vmovdqa %ymm5, 864(%rax) +; AVX2-NEXT: vmovdqa %ymm3, 864(%rax) ; AVX2-NEXT: vmovaps %ymm7, 832(%rax) ; AVX2-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8107,51 +8117,52 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i32_stride6_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX2-FP-NEXT: subq $2472, %rsp # imm = 0x9A8 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm0[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero -; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 4(%r9), %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] @@ -8163,13 +8174,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,2,2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] @@ -8209,18 +8221,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa 128(%rsi), %xmm8 +; AVX2-FP-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] 
; AVX2-FP-NEXT: vmovdqa 128(%r8), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero +; AVX2-FP-NEXT: vpbroadcastd 132(%r9), %ymm12 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 132(%r9), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8271,70 +8284,69 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1] -; AVX2-FP-NEXT: vmovdqa 224(%rsi), %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vmovdqa 224(%rsi), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 224(%r8), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 228(%r9), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm0 -; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FP-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FP-NEXT: vpbroadcastd (%rdx), %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: 
vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastq %xmm2, %ymm1 +; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm8 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 32(%rcx), %xmm2 -; AVX2-FP-NEXT: vbroadcastss 32(%rdx), %xmm14 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-FP-NEXT: vbroadcastss 32(%rcx), %xmm1 +; AVX2-FP-NEXT: vbroadcastss 32(%rdx), %xmm3 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm0, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8353,25 +8365,27 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm14 -; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 64(%rcx), %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd 64(%rdx), %xmm14 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = 
ymm0[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -8415,25 +8429,27 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd 128(%rcx), %xmm14 -; AVX2-FP-NEXT: vpbroadcastd 128(%rdx), %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd 128(%rcx), %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd 128(%rdx), %xmm14 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vmovdqa 128(%r9), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpbroadcastd %xmm0, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 128(%r9), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpbroadcastd %xmm1, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 128(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 128(%rdx), %ymm9 -; AVX2-FP-NEXT: vmovdqa 128(%rcx), %ymm8 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vmovdqa 128(%rcx), %ymm9 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0 @@ -8506,40 +8522,39 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-FP-NEXT: vpbroadcastd 212(%r9), %ymm0 ; AVX2-FP-NEXT: vpblendd {{.*#+}} 
ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd 212(%r9), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 224(%rcx), %xmm14 -; AVX2-FP-NEXT: vbroadcastss 224(%rdx), %xmm15 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vbroadcastss %xmm0, %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FP-NEXT: vbroadcastss 224(%rcx), %xmm0 +; AVX2-FP-NEXT: vbroadcastss 224(%rdx), %xmm14 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vbroadcastss %xmm1, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 224(%rdx), %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 224(%rcx), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FP-NEXT: vmovdqa 224(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 224(%rcx), %ymm14 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,1,2,3] +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rsi), %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; 
AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastd 244(%r9), %ymm15 @@ -8549,8 +8564,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] @@ -8560,31 +8575,29 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vbroadcastss 16(%r9), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[4],ymm4[4],ymm8[5],ymm4[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm15 +; AVX2-FP-NEXT: vpbroadcastd 16(%r9), %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] @@ -8603,17 +8616,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FP-NEXT: vpbroadcastd 48(%r9), %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -8642,7 +8655,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: 
vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -8651,7 +8665,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpbroadcastd 80(%r9), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -8695,9 +8709,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = mem[0,2,2,3,4,6,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8716,15 +8730,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 128(%r8), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpbroadcastd 144(%r9), %ymm2 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = 
ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -8755,15 +8770,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 160(%r8), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa 160(%r8), %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] @@ -8786,73 +8801,72 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vpbroadcastd 208(%r9), %ymm15 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 192(%r8), %ymm15 +; 
AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vpbroadcastd 208(%r9), %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6],ymm12[7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX2-FP-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = mem[2,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = mem[2,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vbroadcastss 240(%r9), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpermilps $250, (%rsp), %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = mem[2,2,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = mem[2,2,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 224(%r8), %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpbroadcastd 240(%r9), %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[2,3],ymm13[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 1440(%rax) -; AVX2-FP-NEXT: vmovaps %ymm13, 1408(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm13, 1504(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm5, 1440(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 1408(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm12, 1312(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm2, 1248(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm1, 1248(%rax) ; AVX2-FP-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm3, 1056(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 1056(%rax) ; AVX2-FP-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm8, 928(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm5, 864(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm3, 864(%rax) ; AVX2-FP-NEXT: vmovaps %ymm7, 832(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8925,28 +8939,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FP-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-FP-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i32_stride6_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $2376, %rsp # imm = 0x948 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8957,70 +8971,70 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,2,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm15 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: 
vmovdqa 96(%r8), %xmm12 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm8 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9028,98 +9042,101 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%r8), %xmm5 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 132(%r9), %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqa 128(%r8), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %xmm0 +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; AVX2-FCP-NEXT: vpbroadcastd 132(%r9), %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FCP-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 160(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 164(%r9), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 160(%r8), %xmm6 -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 164(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 192(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 196(%r9), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 192(%r8), %xmm0 +; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 196(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %xmm3 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,2,2,3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 228(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm14 -; 
AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 228(%r9), %ymm3 +; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm4 +; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 @@ -9130,9 +9147,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm0 ; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm1 @@ -9142,7 +9159,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastq %xmm13, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] @@ 
-9165,15 +9182,15 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastq %xmm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] @@ -9198,144 +9215,144 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastd %xmm10, %xmm0 +; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastq %xmm12, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd 128(%rcx), %xmm0 -; AVX2-FCP-NEXT: vpbroadcastd 128(%rdx), %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm5, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 128(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm6 +; AVX2-FCP-NEXT: vbroadcastss 128(%rcx), %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %ymm12 +; AVX2-FCP-NEXT: vbroadcastss 128(%rdx), %xmm0 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 128(%r9), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %ymm11 ; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %ymm9 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 148(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 160(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 148(%r9), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %ymm7 -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FCP-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 160(%r9), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %ymm10 +; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 180(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 180(%r9), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FCP-NEXT: vpbroadcastd %xmm11, %xmm0 -; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FCP-NEXT: vpbroadcastd %xmm13, %xmm0 +; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 192(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpbroadcastd 192(%r9), %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 
192(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd 212(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FCP-NEXT: vpbroadcastd 212(%r9), %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 224(%rcx), %xmm0 -; AVX2-FCP-NEXT: vbroadcastss 224(%rdx), %xmm4 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FCP-NEXT: vbroadcastss 224(%rdx), %xmm6 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vbroadcastss 224(%r9), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 224(%r9), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovdqa 224(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9346,10 +9363,10 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] @@ -9362,20 +9379,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] @@ -9383,60 +9401,60 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm4 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = 
ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm14 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15 @@ -9451,7 +9469,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9459,19 +9477,19 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm14 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm14 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15 @@ -9481,21 +9499,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9505,31 +9523,31 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, 
%ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 144(%r9), %ymm14 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 144(%r9), %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] @@ -9541,27 +9559,27 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa 160(%r9), %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 176(%r9), %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vperm2i128 $19, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 @@ -9569,70 +9587,70 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovdqa 192(%r9), %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6],ymm12[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[4],ymm13[4],ymm14[5],ymm13[5] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpbroadcastd 208(%r9), %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = 
ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = mem[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm13, %ymm3 ; AVX2-FCP-NEXT: vmovdqa 224(%r8), %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovdqa 224(%r9), %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%r9), %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpbroadcastd 240(%r9), %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpbroadcastd 240(%r9), %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm13[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm8 = mem[2,3],ymm8[2,3] -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7] -; 
AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4,5,6],ymm6[7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %ymm5, 1504(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm6, 1504(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 1440(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm1, 1408(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm2, 1312(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm10, 1216(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm7, 1120(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 1056(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm12, 1024(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm14, 928(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm12, 1216(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm10, 1120(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 1056(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm11, 1024(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm9, 928(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9715,108 +9733,109 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i32_stride6_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm29 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm22 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-NEXT: vpermt2d %zmm22, 
%zmm6, %zmm23 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm20 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512-NEXT: vpermt2d %zmm22, %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vpermt2d %zmm22, %zmm30, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512-NEXT: vpermt2d %zmm18, %zmm6, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-NEXT: vpermt2d %zmm18, %zmm7, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vpermt2d %zmm18, %zmm30, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512-NEXT: vpermt2d %zmm16, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512-NEXT: vpermt2d %zmm16, %zmm7, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm18 ; AVX512-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512-NEXT: vpermi2d %zmm15, %zmm29, %zmm6 ; AVX512-NEXT: 
vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512-NEXT: vpermi2d %zmm15, %zmm29, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2d %zmm15, %zmm29, %zmm30 +; AVX512-NEXT: vpermi2d %zmm15, %zmm29, %zmm21 +; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 ; AVX512-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512-NEXT: vpermt2d (%rcx), %ymm2, %ymm13 ; AVX512-NEXT: movb $36, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm13[0,1,0,1,2,3,6,7] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2d %zmm5, %zmm24, %zmm13 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm15 ; AVX512-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm24 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] @@ -9824,9 +9843,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-110, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512-NEXT: 
vmovdqa64 %zmm11, %zmm23 @@ -9837,119 +9856,122 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm9[2],zmm8[2],zmm9[3],zmm8[3],zmm9[6],zmm8[6],zmm9[7],zmm8[7],zmm9[10],zmm8[10],zmm9[11],zmm8[11],zmm9[14],zmm8[14],zmm9[15],zmm8[15] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512-NEXT: vpermt2d %zmm17, %zmm31, %zmm8 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512-NEXT: vpermt2d %zmm17, %zmm5, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm14[2],zmm17[2],zmm14[3],zmm17[3],zmm14[6],zmm17[6],zmm14[7],zmm17[7],zmm14[10],zmm17[10],zmm14[11],zmm17[11],zmm14[14],zmm17[14],zmm14[15],zmm17[15] +; AVX512-NEXT: vpermt2d %zmm17, %zmm11, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512-NEXT: 
vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 ; AVX512-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm30[2,3,2,3,2,3,2,3] +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm30[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm17[6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-NEXT: vpunpckhdq {{.*#+}} zmm3 = 
zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm3[6,7,6,7,6,7,6,7] ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm29 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 ; AVX512-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 1472(%rax) ; AVX512-NEXT: vmovdqa64 %zmm11, 1408(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 1344(%rax) ; AVX512-NEXT: vmovdqa64 %zmm31, 1152(%rax) @@ -9961,26 +9983,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512-NEXT: vmovdqa64 %zmm25, 576(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm23, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm16, 
(%rax) ; AVX512-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 1216(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512-NEXT: vmovdqa64 %zmm22, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax) ; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i32_stride6_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -9997,21 +10019,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 @@ -10032,7 +10054,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 @@ -10050,7 +10072,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 @@ -10059,13 +10081,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 @@ -10074,14 +10095,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -10092,8 +10115,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 @@ -10106,7 +10128,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 @@ -10135,21 +10157,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} 
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -10207,28 +10229,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm0, 
%zmm31, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -10282,108 +10304,109 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-LABEL: store_i32_stride6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512DQ-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm20 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm30, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm6, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm7, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm30, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm7, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm29, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm29, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm29, %zmm30 +; 
AVX512DQ-NEXT: vpermi2d %zmm15, %zmm29, %zmm21 +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512DQ-NEXT: vpermt2d (%rcx), %ymm2, %ymm13 ; AVX512DQ-NEXT: movb $36, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm13[0,1,0,1,2,3,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm24, %zmm13 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm24, %zmm15 ; AVX512DQ-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512DQ-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm24 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] @@ -10391,9 +10414,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-110, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm23 @@ -10404,119 +10427,122 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} 
-; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm9[2],zmm8[2],zmm9[3],zmm8[3],zmm9[6],zmm8[6],zmm9[7],zmm8[7],zmm9[10],zmm8[10],zmm9[11],zmm8[11],zmm9[14],zmm8[14],zmm9[15],zmm8[15] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm31, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm5, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm14[2],zmm17[2],zmm14[3],zmm17[3],zmm14[6],zmm17[6],zmm14[7],zmm17[7],zmm14[10],zmm17[10],zmm14[11],zmm17[11],zmm14[14],zmm17[14],zmm14[15],zmm17[15] +; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm11, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm30[2,3,2,3,2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm30[2,3,2,3,2,3,2,3] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm17[6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} zmm3 = 
zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm3[6,7,6,7,6,7,6,7] ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1472(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1408(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1344(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1152(%rax) @@ -10528,26 +10554,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 576(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 320(%rax) ; 
AVX512DQ-NEXT: vmovdqa64 %zmm23, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1216(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512DQ-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i32_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -10564,21 +10590,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 @@ -10599,7 +10625,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 @@ -10617,7 +10643,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 @@ -10626,13 +10652,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 @@ -10641,14 +10666,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -10659,8 +10686,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 @@ -10673,7 +10699,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, 
%zmm24, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 @@ -10702,21 +10728,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -10774,28 +10800,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -10849,108 +10875,109 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-LABEL: store_i32_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm29 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = 
[3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm30, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, 
%zmm6 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm13 ; AVX512BW-NEXT: movb $36, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm13[0,1,0,1,2,3,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm15 ; AVX512BW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512BW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm24 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] @@ -10958,9 +10985,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-110, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = 
[8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 @@ -10971,119 +10998,122 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm9[2],zmm8[2],zmm9[3],zmm8[3],zmm9[6],zmm8[6],zmm9[7],zmm8[7],zmm9[10],zmm8[10],zmm9[11],zmm8[11],zmm9[14],zmm8[14],zmm9[15],zmm8[15] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm14[2],zmm17[2],zmm14[3],zmm17[3],zmm14[6],zmm17[6],zmm14[7],zmm17[7],zmm14[10],zmm17[10],zmm14[11],zmm17[11],zmm14[14],zmm17[14],zmm14[15],zmm17[15] +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte 
Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm30[2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm30[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm17[6,7,6,7,6,7,6,7] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm3[6,7,6,7,6,7,6,7] ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1472(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%rax) ; 
AVX512BW-NEXT: vmovdqa64 %zmm5, 1344(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%rax) @@ -11095,26 +11125,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1216(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i32_stride6_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -11131,21 +11161,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 @@ -11166,7 +11196,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 @@ -11184,7 +11214,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 @@ -11193,13 +11223,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 @@ -11208,14 +11237,16 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -11226,8 +11257,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512BW-FCP-NEXT: 
vpermt2d %zmm0, %zmm29, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 @@ -11240,7 +11270,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 @@ -11269,21 +11299,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -11341,28 +11371,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm20 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 @@ -11416,108 +11446,109 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-BW-LABEL: store_i32_stride6_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512DQ-BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm13 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm30, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm7, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm30 +; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm29, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] -; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512DQ-BW-NEXT: vpermt2d (%rcx), %ymm2, %ymm13 ; AVX512DQ-BW-NEXT: movb $36, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm13[0,1,0,1,2,3,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm0 ; AVX512DQ-BW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm24 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm0[0,1,0,1,2,3,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), 
%zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] @@ -11525,9 +11556,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-110, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm23 @@ -11538,119 +11569,122 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm9[2],zmm8[2],zmm9[3],zmm8[3],zmm9[6],zmm8[6],zmm9[7],zmm8[7],zmm9[10],zmm8[10],zmm9[11],zmm8[11],zmm9[14],zmm8[14],zmm9[15],zmm8[15] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm0 = 
zmm14[2],zmm17[2],zmm14[3],zmm17[3],zmm14[6],zmm17[6],zmm14[7],zmm17[7],zmm14[10],zmm17[10],zmm14[11],zmm17[11],zmm14[14],zmm17[14],zmm14[15],zmm17[15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm30[2,3,2,3,2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm30 = ymm30[2],mem[2],ymm30[3],mem[3],ymm30[6],mem[6],ymm30[7],mem[7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm30[2,3,2,3,2,3,2,3] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = 
zmm1[2,3,2,3,2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] -; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] +; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm17[6,7,6,7,6,7,6,7] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm3[6,7,6,7,6,7,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1472(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 1408(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1152(%rax) @@ -11662,26 +11696,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 576(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 1216(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-BW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512DQ-BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 @@ -11698,21 +11732,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 @@ -11733,7 +11767,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 @@ -11751,7 +11785,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 @@ -11760,13 +11794,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 @@ -11775,14 +11808,16 @@ define void 
@store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] @@ -11793,8 +11828,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 @@ -11807,7 +11841,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 @@ -11836,21 +11870,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = 
[0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} @@ -11908,28 +11942,28 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index bead2c94cf121..e9713b8672b59 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -171,8 +171,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,4,2,0,0,4,2,0] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [u,u,u,u,0,4,2,u] ; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1] ; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm2 @@ -200,7 +199,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -224,9 +223,9 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -251,7 +250,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -275,9 +274,9 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-FCP-NEXT: vextracti32x4 
$3, %zmm0, %xmm1 @@ -302,7 +301,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -326,9 +325,9 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -353,7 +352,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -377,9 +376,9 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,4,6,16,18,20,1,3,5,7,17,19,21,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -608,53 +607,53 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm3 -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-FCP-NEXT: vmovaps (%r8), %xmm4 -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FCP-NEXT: vmovaps (%r10), %xmm0 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm2, %ymm2 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: 
vmovaps (%rdi), %xmm1 +; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm2 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm0 +; AVX2-FCP-NEXT: vmovaps (%r8), %xmm3 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm1, %ymm5 +; AVX2-FCP-NEXT: vmovaps (%r10), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm8 = [u,1,5,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [5,0,2,6,5,0,2,6] ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm8 = [7,3,7,3,7,3,7,3] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm8 = [u,u,u,u,u,3,7,u] ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [2,6,0,3,2,6,0,3] ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm5 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm6 ; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,4,0,1,0,4,0,1] ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm8, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vbroadcastss (%r10), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] -; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FCP-NEXT: vpermps %ymm5, %ymm8, %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss (%r10), %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [u,3,7,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vmovaps %xmm0, 96(%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -668,15 +667,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -690,15 +689,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -712,15 +711,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -734,15 +733,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -756,15 +755,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq @@ -778,15 +777,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -800,15 +799,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -822,15 +821,15 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = 
[10,14,18,22,26,3,7,11,15,19,23,27,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -1046,8 +1045,8 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7] ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] ; AVX-NEXT: vbroadcastsd 24(%rax), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,3],xmm11[3,3] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 @@ -1407,27 +1406,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-NEXT: kmovw %ecx, %k1 @@ -1451,27 +1450,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -1495,27 +1494,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512DQ-NEXT: 
vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -1539,27 +1538,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -1583,27 +1582,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[0,7,15,23,31,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -1627,27 +1626,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; 
AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -1671,27 +1670,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -1715,27 +1714,27 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,7,15,23,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,7,15,23,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,0,0,0,0,23,31,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,u,u,u,23,31,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,8,16,0,0,0,0,1,9,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,8,16,24,0,0,0,1,9,17,25,0,0,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,2,10,18,0,0,0,0,3,11,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [2,10,0,0,0,19,27,3,11,0,0,0,20,28,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,5,13,21,29,0,0,0,6,14,22,30,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,12,20,0,0,0,0,5,13,21,0,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -2137,10 +2136,10 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm13[1] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1],xmm0[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps (%rcx), %xmm7 -; AVX-NEXT: vmovaps (%rdx), %xmm12 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm7[1],zero -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rcx), %xmm5 +; AVX-NEXT: vmovaps (%rdx), %xmm11 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm11[1],xmm5[1],zero +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovaps (%r9), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2160,7 +2159,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[2,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] @@ -2244,6 +2243,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2297,13 +2298,13 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2: # %bb.0: ; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps (%rax), %xmm6 -; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rax), %xmm4 +; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%r8), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%r8), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps (%r9), %xmm9 @@ -2314,46 +2315,45 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm10[1],zero +; AVX2-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-NEXT: vbroadcastsd %xmm2, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; 
AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2],xmm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm12[1],xmm11[1],zero +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovaps (%r8), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2362,165 +2362,166 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 16(%rax), %xmm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX2-NEXT: vmovaps 32(%rcx), %ymm6 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-NEXT: vmovaps 32(%r9), %ymm2 +; AVX2-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vbroadcastss 60(%r8), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: vbroadcastss 60(%r9), %ymm1 +; AVX2-NEXT: vbroadcastsd 56(%rax), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm3[3,3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 40(%rax), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss %xmm12, %xmm3 -; AVX2-NEXT: vbroadcastss %xmm10, %xmm1 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} 
xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX2-NEXT: vbroadcastsd %xmm9, %ymm3 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm10[3,3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; AVX2-NEXT: vmovaps %xmm7, %xmm15 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-NEXT: vbroadcastsd %xmm12, %ymm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = 
ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6],ymm2[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] -; AVX2-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] +; AVX2-NEXT: vmovaps %ymm4, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm8[3,3],ymm1[7,7],ymm8[7,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3,4],ymm7[5,6],ymm8[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[4],ymm5[4],ymm11[5],ymm5[5] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-NEXT: # xmm6 = xmm1[3,3],mem[3,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm1[3,3],mem[3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX2-NEXT: # xmm7 = xmm15[3,3],mem[3,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-NEXT: vmovaps %ymm4, 320(%rax) ; AVX2-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-NEXT: vmovaps %ymm3, 128(%rax) ; AVX2-NEXT: vmovaps %ymm10, 352(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 160(%rax) @@ -2548,13 +2549,13 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps (%rax), %xmm6 -; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rax), %xmm4 +; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%r8), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%r8), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps (%r9), %xmm9 @@ -2565,46 +2566,45 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm10[1],zero +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3] 
+; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm2, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2],xmm2[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm12[1],xmm11[1],zero +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%r8), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2613,165 +2613,166 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm6 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 60(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FP-NEXT: vbroadcastss 60(%r9), %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 56(%rax), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm3[3,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; 
AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm3 -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm1 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm3 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm10[3,3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-FP-NEXT: 
vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; AVX2-FP-NEXT: vmovaps %xmm7, %xmm15 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm3 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastsd 
48(%rax), %ymm2 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] -; AVX2-FP-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] +; AVX2-FP-NEXT: vmovaps %ymm4, %ymm0 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm8[3,3],ymm1[7,7],ymm8[7,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3,4],ymm7[5,6],ymm8[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[4],ymm5[4],ymm11[5],ymm5[5] +; AVX2-FP-NEXT: vshufps {{.*#+}} 
ymm4 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm6 = xmm1[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm1[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm7 = xmm15[3,3],mem[3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm4, 320(%rax) ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 128(%rax) ; AVX2-FP-NEXT: vmovaps %ymm10, 352(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) @@ -2797,52 +2798,51 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FCP-LABEL: store_i32_stride7_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $536, %rsp # imm = 0x218 +; AVX2-FCP-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm5 -; AVX2-FCP-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm7 -; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: 
vmovaps 32(%r9), %xmm14 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,1,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm13 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm6 +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm15 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm11[1],zero -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm14 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm14[1],zero +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm8[1],zero +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm7[1],zero ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2869,164 +2869,163 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 
= ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] ; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm15, %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FCP-NEXT: vbroadcastss 60(%r8), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FCP-NEXT: vbroadcastss 60(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm8[3,3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,2,2,0,1,2,2] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FCP-NEXT: vmovaps %xmm14, %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] -; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm10 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastsd %xmm14, %ymm9 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,1,2,2,0,1,2,2] +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovaps %xmm15, %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm2 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm15, %ymm8 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm7[1,1],ymm3[5,5],ymm7[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,0,1,4,5,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm6[3,3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] ; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm3 +; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm4 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = 
xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm4 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm14, %ymm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,0,0,4,4,4,4] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FCP-NEXT: 
vunpckhps {{.*#+}} ymm9 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[6],ymm0[6],ymm8[7],ymm0[7] ; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm0 -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm10[3,3],ymm2[7,7],ymm10[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm7 = xmm2[3,3],mem[3,3] +; AVX2-FCP-NEXT: vmovaps %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm6[3,3],ymm3[7,7],ymm6[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0],ymm5[1,2],ymm8[3,4],ymm5[5,6],ymm8[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm11[3,3],xmm12[3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 
= ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm15[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm13, (%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3045,7 +3044,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FCP-NEXT: addq $536, %rsp # imm = 0x218 +; AVX2-FCP-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -3060,96 +3059,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512-NEXT: 
vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-NEXT: kmovw %ecx, %k3 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512-NEXT: kmovw %ecx, %k1 @@ -3175,96 +3174,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512-FCP-NEXT: kmovw %ecx, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512-FCP-NEXT: kmovw 
%ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512-FCP-NEXT: kmovw %ecx, %k1 @@ -3290,96 +3289,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 
{%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-NEXT: kmovw %ecx, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-NEXT: kmovw %ecx, %k1 @@ -3405,96 +3404,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k3 ; 
AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 @@ -3520,96 +3519,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512BW-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512BW-NEXT: movw $-31994, %cx # 
imm = 0x8306 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -3635,96 +3634,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-FCP-NEXT: kmovd 
%ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3750,96 +3749,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 
%zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; 
AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -3865,96 +3864,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $24769, %cx # imm = 0x60C1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: movw $3096, %cx # imm = 0xC18 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movw $-31994, %cx # imm = 0x8306 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2} -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm11, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4601,20 +4600,18 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i32_stride7_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX-NEXT: subq $1656, %rsp # imm = 0x678 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rcx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rax), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -4638,9 +4635,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmovaps (%rcx), %xmm8 -; AVX-NEXT: vmovaps (%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX-NEXT: vmovaps (%rdx), %xmm7 +; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] @@ -4656,7 +4653,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm8[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4674,8 +4671,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps (%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%r9), %ymm8 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX-NEXT: vmovaps (%r9), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX-NEXT: vmovaps (%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4684,24 +4682,24 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm7[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rcx), %xmm9 +; AVX-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 32(%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%r8), %xmm4 +; AVX-NEXT: vmovaps 32(%r9), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX-NEXT: vmovaps 32(%r8), %xmm6 +; AVX-NEXT: 
vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 32(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4709,28 +4707,28 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm7[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 32(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4745,24 +4743,24 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX-NEXT: vmovaps 64(%rsi), %xmm7 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm7[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[2,0],xmm7[2,1] +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 64(%rcx), %xmm9 -; AVX-NEXT: vmovaps 64(%rdx), %xmm10 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX-NEXT: vmovaps 64(%rcx), %xmm8 +; AVX-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%r9), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 64(%r8), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX-NEXT: vmovaps 64(%r8), %xmm6 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 64(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4770,33 +4768,34 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm7[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm11[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5] +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm9[1,1],ymm13[5,5],ymm9[5,5] -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 64(%r8), %ymm7 -; AVX-NEXT: vmovaps 64(%r9), %ymm10 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] +; AVX-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX-NEXT: vmovaps 64(%r8), %ymm8 +; AVX-NEXT: vmovaps 64(%r9), %ymm9 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm8[2,1],ymm1[6,4],ymm8[6,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -4804,11 +4803,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm6[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm5[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 ; AVX-NEXT: vmovaps 96(%rcx), %xmm4 ; AVX-NEXT: vmovaps 96(%rdx), %xmm3 @@ -4817,47 +4816,47 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX-NEXT: vmovaps 96(%r9), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%r9), %xmm6 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 96(%r8), %xmm11 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm11[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm11[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm2[1] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] ; AVX-NEXT: 
vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm4[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vmovaps 16(%rax), %xmm15 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -4872,15 +4871,13 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] -; AVX-NEXT: vmovaps %ymm13, %ymm14 +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] ; 
AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[0,2],ymm9[5,5],ymm1[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vmovaps 80(%rax), %xmm15 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] @@ -4890,24 +4887,21 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm15[3,3],ymm8[7,7],ymm15[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3],ymm12[3,3],ymm5[7,7],ymm12[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vbroadcastss 124(%r8), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX-NEXT: vbroadcastss 124(%r9), %ymm1 +; AVX-NEXT: vbroadcastsd 120(%rax), %ymm15 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vbroadcastsd 120(%rax), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps %ymm3, %ymm1 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX-NEXT: vmovaps %ymm2, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] -; AVX-NEXT: vmovaps %ymm1, %ymm2 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,1],ymm0[0,2],ymm3[7,5],ymm0[4,6] +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm12[0],ymm5[1],ymm12[1],ymm5[4],ymm12[4],ymm5[5],ymm12[5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -4916,8 +4910,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm8[1,1],ymm15[5,5],ymm8[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm5[1,1],ymm12[5,5],ymm5[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -4930,23 
+4924,24 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[2,2,2,2] +; AVX-NEXT: vbroadcastsd 8(%rax), %ymm15 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 8(%rax), %ymm15 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm12[3,3],ymm4[7,7],ymm12[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm4[3,3],ymm11[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm5[3,3],ymm11[7,7],ymm5[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,3],ymm6[3,3],ymm3[7,7],ymm6[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4969,19 +4964,19 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[2,2,2,2] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX-NEXT: vbroadcastsd 40(%rax), %ymm15 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm6, %ymm3 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm6[3,3],ymm10[7,7],ymm6[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm11[3,3],ymm4[7,7],ymm11[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5002,21 +4997,21 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,2,2,2] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX-NEXT: vbroadcastsd 72(%rax), %ymm6 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm14, %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm14[3,3],ymm4[7,7],ymm14[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3],ymm6[3,3],ymm5[7,7],ymm6[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5043,12 +5038,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -5056,20 +5051,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,1],ymm3[0,2],ymm11[7,5],ymm3[4,6] +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[4],ymm11[4],ymm4[5],ymm11[5] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[3,1],ymm3[0,2],ymm12[7,5],ymm3[4,6] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm13[3,3],mem[3,3] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm14[3,3],mem[3,3] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm5[0,2],ymm6[7,5],ymm5[4,6] +; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm9[3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm10[3,3] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] @@ -5125,7 +5120,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 864(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 832(%rax) -; AVX-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX-NEXT: addq $1656, %rsp # imm = 0x678 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5134,47 +5129,49 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: subq $1320, %rsp # imm = 0x528 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps (%rax), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%r8), %xmm14 -; AVX2-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%r9), %xmm10 -; AVX2-NEXT: vmovaps 32(%r9), %xmm9 -; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX2-NEXT: vmovaps (%r8), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%r8), %xmm5 +; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%r9), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%r9), %xmm8 +; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-NEXT: vmovaps (%rcx), %xmm13 ; AVX2-NEXT: vmovaps 32(%rcx), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero -; AVX2-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX2-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm13[1],zero +; AVX2-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero +; AVX2-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm4[1],zero ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5197,16 +5194,16 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vmovaps 64(%rcx), %xmm2 -; 
AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovaps 64(%rdx), %xmm13 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm14[1],xmm2[1],zero ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%r9), %xmm7 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-NEXT: vmovaps 96(%r9), %xmm8 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: vmovaps 96(%rax), %xmm1 @@ -5274,11 +5271,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5297,170 +5294,169 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] +; AVX2-NEXT: vmovaps 96(%rdx), %ymm10 +; AVX2-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] -; AVX2-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 
112(%rax), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm8, %xmm15 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps %xmm14, %xmm11 -; AVX2-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] +; AVX2-NEXT: vbroadcastss 112(%r9), %xmm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 112(%rax), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-NEXT: vbroadcastss %xmm13, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm7[3,3],xmm6[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm11[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 40(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm2[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-NEXT: vbroadcastss %xmm12, %xmm0 ; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm11, %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm7[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-NEXT: vbroadcastsd 104(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] ; AVX2-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vmovaps 96(%rax), %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-NEXT: vbroadcastss 124(%r8), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 124(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-NEXT: vbroadcastss 124(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vbroadcastsd 120(%rax), %ymm1 @@ -5468,17 +5464,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm6[1,1],ymm9[5,5],ymm6[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -5487,12 +5483,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] @@ -5510,101 +5506,100 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 
# 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm5 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-NEXT: vmovaps %ymm6, %ymm1 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm3[3,3],mem[3,3] +; AVX2-NEXT: vblendps $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm10[3,3],ymm11[3,3],ymm10[7,7],ymm11[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = 
ymm9[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4],ymm5[5,6],ymm10[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6],ymm4[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm2[3,3],mem[3,3] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-NEXT: vbroadcastss 80(%rdx), %ymm7 -; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = 
ymm13[0],ymm5[1,2],ymm13[3,4],ymm5[5,6],ymm13[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-NEXT: vbroadcastss 80(%rdx), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-NEXT: # xmm12 = xmm5[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[4],ymm15[4],ymm0[5],ymm15[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = xmm2[3,3],mem[3,3] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-NEXT: vmovaps %ymm7, 544(%rax) 
+; AVX2-NEXT: vmovaps %ymm13, 640(%rax) +; AVX2-NEXT: vmovaps %ymm5, 544(%rax) ; AVX2-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-NEXT: vmovaps %ymm11, 320(%rax) ; AVX2-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-NEXT: vmovaps %ymm3, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 384(%rax) @@ -5655,47 +5650,49 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: subq $1320, %rsp # imm = 0x528 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%r8), %xmm14 -; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%r9), %xmm10 -; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm9 -; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX2-FP-NEXT: vmovaps (%r8), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm5 +; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm8 +; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm13 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm13[1],zero +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-FP-NEXT: 
vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm4[1],zero ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5718,16 +5715,16 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm13 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm14[1],xmm2[1],zero ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm7 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm8 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 96(%rax), %xmm1 @@ -5795,11 +5792,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: 
vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5818,170 +5815,169 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm10 +; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm3 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-FP-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm15 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps %xmm14, %xmm11 -; AVX2-FP-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vbroadcastss 112(%r9), %xmm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 112(%rax), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = 
xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm0 +; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm7[3,3],xmm6[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3,4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm11[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm2[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm5 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm7[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vbroadcastsd 104(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] ; AVX2-FP-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vmovaps 96(%rax), %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FP-NEXT: vbroadcastss 124(%r8), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 124(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FP-NEXT: vbroadcastss 124(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vbroadcastsd 120(%rax), %ymm1 @@ -5989,17 +5985,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm6[1,1],ymm9[5,5],ymm6[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -6008,12 +6004,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] @@ -6031,101 +6027,100 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm5 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FP-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 16(%rdx), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 16(%rdx), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-FP-NEXT: vmovaps %ymm6, %ymm1 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = xmm3[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm3[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm10[3,3],ymm11[3,3],ymm10[7,7],ymm11[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = xmm3[3,3],mem[3,3] 
-; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4],ymm5[5,6],ymm10[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6],ymm4[7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm2[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] -; AVX2-FP-NEXT: vbroadcastss 80(%rdx), %ymm7 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1,2],ymm13[3,4],ymm5[5,6],ymm13[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FP-NEXT: vbroadcastss 80(%rdx), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6],ymm5[7] +; 
AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm12 = xmm5[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[4],ymm15[4],ymm0[5],ymm15[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = xmm2[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-FP-NEXT: vmovaps %ymm13, 640(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 544(%rax) ; AVX2-FP-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 320(%rax) ; 
AVX2-FP-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm3, 384(%rax) @@ -6178,7 +6173,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6202,10 +6197,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm11 +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -6255,7 +6251,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm7 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] -; AVX2-FCP-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1 @@ -6277,24 +6273,24 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm0[2],ymm12[3],ymm0[3],ymm12[6],ymm0[6],ymm12[7],ymm0[7] +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-FCP-NEXT: 
vmovaps (%rcx), %ymm10 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm8 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm14 +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm11 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1,2,2,5,5,6,6] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2],ymm1[3,4,5],ymm14[6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] @@ -6346,67 +6342,68 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1],ymm0[1,1],ymm6[5,5],ymm0[5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6],ymm1[7] ; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-FCP-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm3 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7] -; AVX2-FCP-NEXT: 
vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vbroadcastss 108(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] -; AVX2-FCP-NEXT: vpermps 96(%r9), %ymm12, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7] -; AVX2-FCP-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-FCP-NEXT: vbroadcastss 112(%rax), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6],ymm4[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastss 108(%r8), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm13[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vpermps 96(%r9), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 96(%rax), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] 
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 124(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FCP-NEXT: vbroadcastss 124(%r9), %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 120(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 120(%rax), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps %xmm11, %xmm15 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2] -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps %xmm9, %xmm15 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] +; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -6416,29 +6413,28 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm15[3,3],xmm13[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps %xmm5, %xmm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm3 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6450,20 +6446,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] @@ -6476,9 +6472,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 
32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = ymm0[1,1],mem[1,1],ymm0[5,5],mem[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1],ymm11[1,1],ymm0[5,5],ymm11[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] @@ -6496,159 +6492,160 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm2 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm6[3,3],xmm0[3,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm10[1,1],ymm14[5,5],ymm10[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm3 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm4 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpermps %ymm3, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = 
ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6],ymm8[7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3],xmm1[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FCP-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm4[5,6],ymm6[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,2,2] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6],ymm4[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps 
$255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4],ymm6[5,6],ymm8[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vmovaps %ymm11, %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6],ymm6[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm7 = xmm5[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] 
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm2[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm8 +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3,4],ymm9[5,6],ymm11[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6],ymm9[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm0[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm11 = xmm0[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm12 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3,4],ymm12[5,6],ymm13[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4],ymm12[5,6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm9, 640(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm8, 544(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 736(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 640(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 544(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm8, 416(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 320(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 736(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6712,42 +6709,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512-NEXT: movw $6192, 
%cx # imm = 0x1830 ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -6758,7 +6755,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -6768,7 +6765,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -6776,14 +6773,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d 
%zmm9, %zmm1, %zmm21 ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512-NEXT: kmovw %eax, %k2 @@ -6791,56 +6788,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512-NEXT: kmovw %eax, %k3 ; 
AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512-NEXT: kmovw %eax, %k2 @@ -6851,17 +6848,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -6869,15 +6866,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -6915,42 +6912,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -6961,7 +6958,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -6971,7 +6968,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -6979,14 +6976,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512-FCP-NEXT: kmovw %eax, %k2 @@ -6994,56 +6991,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = 
[0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512-FCP-NEXT: kmovw %eax, %k2 @@ -7054,17 +7051,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; 
AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7072,15 +7069,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7118,42 +7115,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7164,7 +7161,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7174,7 +7171,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7182,14 +7179,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-NEXT: 
vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-NEXT: kmovw %eax, %k2 @@ -7197,56 +7194,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-NEXT: kmovw %eax, %k2 @@ -7257,17 +7254,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7275,15 +7272,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7321,42 +7318,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = 
[9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7367,7 +7364,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7377,7 +7374,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7385,14 +7382,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 @@ -7400,56 +7397,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-FCP-NEXT: movw $-30962, 
%ax # imm = 0x870E ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 @@ -7460,17 +7457,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; 
AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7478,15 +7475,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7524,42 +7521,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: 
vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7570,7 +7567,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7580,7 +7577,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7588,14 +7585,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -7603,56 +7600,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -7663,17 +7660,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7681,15 +7678,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7727,42 +7724,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7773,7 +7770,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7783,7 +7780,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7791,14 +7788,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -7806,56 +7803,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -7866,17 +7863,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -7884,15 +7881,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -7930,42 +7927,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm21, 
%zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -7976,7 +7973,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -7986,7 +7983,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -7994,14 +7991,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -8009,56 +8006,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -8069,17 +8066,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = 
[0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -8087,15 +8084,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -8133,42 +8130,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 
%zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm27 @@ -8179,7 +8176,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} @@ -8189,7 +8186,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 @@ -8197,14 +8194,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512DQ-BW-FCP-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -8212,56 +8209,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %ax # imm = 0x70E1 ; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -8272,17 +8269,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} @@ -8290,15 +8287,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) @@ -9592,24 +9589,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i32_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX-NEXT: subq $3416, %rsp # imm = 0xD58 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rcx), %ymm5 +; AVX-NEXT: vmovaps 224(%rcx), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%r8), %ymm5 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%r8), %ymm4 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rax), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] -; AVX-NEXT: vmovaps %ymm1, %ymm5 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] @@ -9622,35 +9618,35 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%r9), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%r8), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%r8), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdi), %xmm6 -; AVX-NEXT: vmovaps (%rsi), %xmm5 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm6[0] +; AVX-NEXT: vmovaps 
(%rsi), %xmm4 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm6[0] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps (%rcx), %xmm7 -; AVX-NEXT: vmovaps (%rdx), %xmm8 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rcx), %xmm14 +; AVX-NEXT: vmovaps (%rdx), %xmm7 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm5[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm14[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9668,8 +9664,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps (%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%r9), %ymm8 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX-NEXT: vmovaps (%r9), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX-NEXT: vmovaps (%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9678,17 +9675,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm9[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,1] -; AVX-NEXT: vmovaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm8[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX-NEXT: vmovaps 32(%rcx), %xmm9 +; AVX-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%r9), %xmm3 @@ -9706,25 +9703,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rcx), %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] -; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 32(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9739,17 +9736,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm7[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm8[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 64(%rcx), %xmm9 -; AVX-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%r9), %xmm3 @@ -9767,11 +9764,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9781,11 +9778,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX-NEXT: vmovaps 64(%rcx), %ymm9 +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm9[1,1],ymm1[5,5],ymm9[5,5] +; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 64(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9800,24 +9797,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm5[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm8[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,1] +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX-NEXT: vmovaps 96(%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%rcx), %xmm10 +; AVX-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 96(%r9), %xmm4 +; AVX-NEXT: vmovaps 96(%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 96(%r8), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 96(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9825,33 +9822,34 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm10[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX-NEXT: vmovaps 96(%rsi), %ymm0 
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX-NEXT: vmovaps 96(%rcx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[1,1],ymm8[5,5],ymm1[5,5] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 96(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%r9), %ymm14 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] +; AVX-NEXT: vmovaps 96(%r9), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX-NEXT: vmovaps 96(%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9860,24 +9858,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %xmm5 -; AVX-NEXT: vmovaps 128(%rsi), %xmm4 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX-NEXT: vmovaps 128(%rsi), %xmm10 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm11[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 128(%rcx), %xmm7 -; AVX-NEXT: vmovaps 128(%rdx), %xmm9 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%rcx), %xmm6 +; AVX-NEXT: vmovaps 128(%rdx), %xmm5 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 128(%r9), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%r8), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX-NEXT: vmovaps 128(%r8), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 128(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9885,14 +9883,14 @@ define void @store_i32_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm6[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9902,11 +9900,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX-NEXT: vmovaps 128(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9921,24 +9919,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX-NEXT: vmovaps 160(%rdi), %xmm10 ; AVX-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm11[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm10[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,1] +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 160(%rcx), %xmm7 -; AVX-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 160(%rcx), %xmm11 +; AVX-NEXT: vmovaps 160(%rdx), %xmm5 +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] +; AVX-NEXT: vmovaps %xmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 160(%r9), %xmm4 +; AVX-NEXT: vmovaps 160(%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 160(%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 160(%r8), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 160(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9946,22 +9944,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm6[1] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm11[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX-NEXT: vmovaps 160(%rsi), %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX-NEXT: vmovaps 160(%rdi), %ymm13 +; AVX-NEXT: vmovaps 160(%rsi), %ymm15 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm13[1,1],ymm15[5,5],ymm13[5,5] ; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovaps 160(%rdx), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9969,11 +9967,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX-NEXT: vmovaps 160(%r8), %ymm11 -; AVX-NEXT: vmovaps 160(%r9), %ymm13 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 160(%r8), %ymm10 +; AVX-NEXT: vmovaps 160(%r9), %ymm12 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[2,0],ymm10[2,1],ymm1[6,4],ymm10[6,5] +; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rax), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -9981,11 +9979,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm12 -; AVX-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm12[0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,1] -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX-NEXT: vmovaps 192(%rsi), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm11[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX-NEXT: vmovaps 192(%rdx), %xmm6 @@ -9994,11 +9992,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 192(%r9), %xmm4 +; AVX-NEXT: vmovaps 192(%r9), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 192(%r8), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%r8), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps 192(%rax), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10006,12 +10004,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm5[1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -10019,131 +10017,130 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rcx), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] -; AVX-NEXT: vmovaps 192(%r8), %ymm1 +; AVX-NEXT: vmovaps 192(%rsi), %ymm6 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[1,1],ymm6[5,5],ymm0[5,5] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX-NEXT: vmovaps 192(%r8), %ymm4 ; AVX-NEXT: vmovaps 192(%r9), %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm1[2,0],ymm4[2,1],ymm1[6,4],ymm4[6,5] +; AVX-NEXT: vmovaps 192(%rdx), %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 192(%rcx), %ymm5 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm5[1,1],ymm0[5,5],ymm5[5,5] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6],ymm3[7] +; AVX-NEXT: vextractf128 $1, %ymm11, %xmm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 16(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 16(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 48(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 48(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 80(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm3[1],ymm14[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 80(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 112(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 112(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = 
ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 144(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 144(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX-NEXT: vmovaps 176(%rax), %xmm12 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX-NEXT: vmovaps 176(%rax), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX-NEXT: vmovaps %ymm6, %ymm10 +; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; 
AVX-NEXT: vmovaps %ymm5, %ymm8 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vmovaps 208(%rax), %xmm2 @@ -10154,63 +10151,63 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX-NEXT: vmovaps 224(%rdx), %xmm12 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] -; AVX-NEXT: vbroadcastss 228(%r9), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] -; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm5 +; AVX-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX-NEXT: vmovaps 224(%rdx), %xmm11 +; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm11[1],xmm1[1],zero +; AVX-NEXT: vbroadcastss 228(%r8), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6,7] +; AVX-NEXT: vbroadcastss 228(%r9), %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm4, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm11[1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm3[3,3] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6],ymm0[7] ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vbroadcastss 232(%r9), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX-NEXT: vbroadcastss 232(%r9), %xmm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX-NEXT: vbroadcastss 232(%rax), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = 
ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vbroadcastss 220(%r8), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX-NEXT: vbroadcastss 220(%r9), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vbroadcastsd 216(%rax), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,3],ymm10[3,3],ymm9[7,7],ymm10[7,7] +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vbroadcastss 220(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX-NEXT: vbroadcastss 220(%r9), %ymm4 +; AVX-NEXT: vbroadcastsd 216(%rax), %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6],ymm5[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 +; AVX-NEXT: vbroadcastss 224(%r9), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX-NEXT: vbroadcastss 224(%r9), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX-NEXT: vbroadcastss 224(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,3],ymm4[3,3],ymm5[7,7],ymm4[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -10220,19 +10217,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX-NEXT: vmovaps %ymm5, %ymm1 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX-NEXT: vmovaps %ymm4, %ymm5 +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX-NEXT: vmovaps %ymm1, %ymm8 +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX-NEXT: vmovaps %ymm3, %ymm4 +; AVX-NEXT: vmovaps %ymm2, %ymm3 +; AVX-NEXT: vbroadcastss 236(%r8), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX-NEXT: vbroadcastss 236(%r8), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vbroadcastss 236(%r9), %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm8[1,1],ymm5[5,5],ymm8[5,5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -10259,8 +10261,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10297,9 +10300,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm9[3,3],mem[3,3],ymm9[7,7],mem[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10332,14 +10335,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload ; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm14[3,3],mem[3,3],ymm14[7,7],mem[7,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -10357,14 +10359,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[2,2,2,2] +; AVX-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10397,20 +10400,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm7[3,3],mem[3,3],ymm7[7,7],mem[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm13[3,3],ymm2[7,7],ymm13[7,7] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] @@ -10419,125 +10421,125 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm0[3,3],mem[3,3] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[2,2,2,2] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] -; AVX-NEXT: vbroadcastsd 168(%rax), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6],ymm4[7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3] +; AVX-NEXT: vbroadcastsd 168(%rax), %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # 
ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] +; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3],ymm4[1,2],ymm5[6,7],ymm4[5,6] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4],ymm4[5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6],ymm4[7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX-NEXT: vbroadcastsd 200(%rax), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4],ymm1[5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; 
AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1],ymm4[0,2],ymm6[7,5],ymm4[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm4[1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,1],ymm4[0,2],ymm9[7,5],ymm4[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vshufps $255, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm4[1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,1],ymm4[0,2],ymm3[7,5],ymm4[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm3[3,3],mem[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm4[1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,1],ymm4[0,2],ymm3[7,5],ymm4[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] -; AVX-NEXT: vmovaps %ymm12, %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm2[3,3],mem[3,3] -; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm3[3,3],mem[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] -; AVX-NEXT: vblendps 
{{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm13[3,3],mem[3,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm2[3,1],ymm13[0,2],ymm2[7,5],ymm13[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5],ymm13[6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm2[3,3],mem[3,3] +; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm13[1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX-NEXT: vmovaps %ymm11, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm11[0,2],ymm0[7,5],ymm11[4,6] +; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm14[3,3],xmm15[3,3] ; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3] ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] @@ -10556,11 +10558,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps %ymm10, 1440(%rax) ; AVX-NEXT: vmovaps %ymm11, 1216(%rax) -; AVX-NEXT: vmovaps %ymm3, 992(%rax) +; AVX-NEXT: vmovaps %ymm4, 992(%rax) ; AVX-NEXT: vmovaps %ymm1, 768(%rax) -; AVX-NEXT: vmovaps %ymm5, 544(%rax) -; AVX-NEXT: vmovaps %ymm6, 320(%rax) -; AVX-NEXT: vmovaps %ymm14, 96(%rax) +; AVX-NEXT: vmovaps %ymm9, 544(%rax) +; AVX-NEXT: vmovaps %ymm5, 320(%rax) +; AVX-NEXT: vmovaps %ymm6, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10659,7 +10661,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX-NEXT: addq $3416, %rsp # imm = 0xD58 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -10673,27 +10675,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%r8), %xmm13 ; AVX2-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%r9), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%r9), %xmm11 ; AVX2-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 ; 
AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero -; AVX2-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] +; AVX2-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm14[1],zero +; AVX2-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -10705,11 +10706,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm3[1],zero ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10768,7 +10769,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -10792,7 +10793,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%r8), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -11004,10 +11005,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-NEXT: vmovaps 224(%rcx), %xmm7 +; AVX2-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-NEXT: vmovaps 224(%rdx), %xmm6 +; AVX2-NEXT: vbroadcastss %xmm6, %xmm3 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm5 @@ -11018,30 +11019,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 224(%r8), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-NEXT: vbroadcastss %xmm3, %ymm1 +; AVX2-NEXT: vbroadcastss 224(%rax), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm7[1],zero +; AVX2-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-NEXT: vinsertf128 $1, 224(%rax), %ymm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-NEXT: vmovaps 224(%r8), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] @@ -11049,276 +11050,275 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps 224(%rdx), %ymm12 -; AVX2-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] -; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-NEXT: vbroadcastss 240(%r9), %xmm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-NEXT: vbroadcastss %xmm9, %xmm15 -; AVX2-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm13, %xmm1 -; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-NEXT: vmovaps 224(%rsi), %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovaps 224(%rdx), %ymm4 +; AVX2-NEXT: vmovaps 224(%rcx), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1],ymm0[1,1],ymm4[5,5],ymm0[5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6],ymm6[7] +; AVX2-NEXT: vbroadcastsd 240(%r8), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vbroadcastss 240(%r9), %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} 
ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastss 240(%rax), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-NEXT: vbroadcastss %xmm12, %xmm7 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm8[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] +; AVX2-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm11, %xmm6 +; AVX2-NEXT: vbroadcastss %xmm10, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 
16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm12[3,3],xmm13[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,2,2,2] ; AVX2-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm12, %xmm6 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm11, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,2,2,2] ; AVX2-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 
= ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] ; AVX2-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] ; AVX2-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: 
vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-NEXT: vbroadcastss %xmm13, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] ; AVX2-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vbroadcastss %xmm14, %xmm6 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-NEXT: vbroadcastss %xmm13, %xmm7 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-NEXT: vshufps 
{{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] ; AVX2-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload ; AVX2-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-NEXT: vbroadcastss 220(%r9), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 220(%r9), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vbroadcastsd 216(%rax), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 240(%rdx), %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = 
ymm2[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-NEXT: vbroadcastss 236(%r8), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-NEXT: vbroadcastss 236(%r8), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-NEXT: vmovaps 224(%rax), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11335,8 +11335,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm6[1,1],ymm11[5,5],ymm6[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] 
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] @@ -11353,9 +11353,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11400,8 +11400,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11445,7 +11445,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -11457,7 +11457,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] @@ -11468,8 +11468,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhps 
{{.*#+}} ymm0 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -11478,10 +11479,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vmovaps %ymm8, %ymm6 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,1,2,0,7,5,6,4] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -11490,7 +11491,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] @@ -11503,8 +11504,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] @@ -11532,7 +11532,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -11562,65 +11562,65 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastss 144(%rdx), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; 
AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = xmm7[3,3],mem[3,3] +; AVX2-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4],ymm13[5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[3,1,2,0,7,5,6,4] +; AVX2-NEXT: vbroadcastss 208(%rdx), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vshufps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm1[3,3],mem[3,3] @@ -11635,17 +11635,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm4, 992(%rax) ; AVX2-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-NEXT: vmovaps %ymm12, 640(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-NEXT: vmovaps %ymm11, 416(%rax) +; AVX2-NEXT: vmovaps %ymm8, 320(%rax) +; AVX2-NEXT: vmovaps %ymm9, 192(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-NEXT: vmovaps %ymm10, 1472(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11742,27 +11742,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%r8), %xmm13 ; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%r9), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%r9), %xmm11 ; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm14[1],zero +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -11774,11 +11773,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm3[1],zero ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11837,7 +11836,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -11861,7 +11860,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -12073,10 +12072,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2 -; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm7 +; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm6 +; AVX2-FP-NEXT: vbroadcastss %xmm6, %xmm3 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm5 @@ -12087,30 +12086,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 224(%r8), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-FP-NEXT: vbroadcastss %xmm3, %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FP-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] -; 
AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-FP-NEXT: vbroadcastss %xmm3, %ymm1 +; AVX2-FP-NEXT: vbroadcastss 224(%rax), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm7[1],zero +; AVX2-FP-NEXT: vbroadcastss 228(%r8), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rax), %ymm1, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] @@ -12118,115 +12117,83 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; 
AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm12 -; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] -; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FP-NEXT: vbroadcastss 240(%r9), %xmm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FP-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm15 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps %xmm13, %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm1 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] +; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps 224(%rcx), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1],ymm0[1,1],ymm4[5,5],ymm0[5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6],ymm6[7] +; AVX2-FP-NEXT: vbroadcastsd 240(%r8), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vbroadcastss 240(%r9), %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastss 240(%rax), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} 
ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm8[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] +; AVX2-FP-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm12[3,3],xmm13[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,2,2,2] +; AVX2-FP-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm6 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -12234,160 +12201,191 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vshufps 
{{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,2,2,2] +; AVX2-FP-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 136(%rax), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FP-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] -; AVX2-FP-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-FP-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm7 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] +; AVX2-FP-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FP-NEXT: 
vbroadcastss %xmm13, %xmm7 +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,3],xmm9[3,3] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,2,2,2] ; AVX2-FP-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FP-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-FP-NEXT: vbroadcastss 220(%r9), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 220(%r9), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vbroadcastsd 216(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 240(%rdx), %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FP-NEXT: vbroadcastss 236(%r8), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vbroadcastss 236(%r8), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -12404,8 +12402,8 @@ define void 
@store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm6[1,1],ymm11[5,5],ymm6[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] @@ -12422,9 +12420,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -12469,8 +12467,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vbroadcastsd 112(%rax), %ymm2 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12514,7 +12512,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -12526,7 +12524,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: 
vbroadcastss 16(%rdx), %ymm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] @@ -12537,8 +12535,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -12547,10 +12546,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vmovaps %ymm8, %ymm6 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,1,2,0,7,5,6,4] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -12559,7 +12558,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] @@ -12572,8 +12571,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] @@ -12601,7 +12599,7 @@ define 
void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -12631,65 +12629,65 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastss 144(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FP-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = xmm7[3,3],mem[3,3] +; AVX2-FP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FP-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm13[0],ymm0[1,2,3,4],ymm13[5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FP-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-FP-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FP-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = mem[3,1,2,0,7,5,6,4] +; AVX2-FP-NEXT: vbroadcastss 208(%rdx), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm1[3,3],mem[3,3] @@ -12704,17 +12702,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm4, 992(%rax) ; AVX2-FP-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-FP-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-FP-NEXT: vmovaps %ymm12, 640(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 416(%rax) +; AVX2-FP-NEXT: vmovaps %ymm8, 320(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 192(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FP-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 1472(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12993,16 +12991,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%r8), %ymm15 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] @@ -13010,11 +13008,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13033,11 +13031,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13079,11 +13077,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13102,11 +13100,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13125,17 +13123,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm12 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm13 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm8 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm9 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13148,111 +13146,111 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm5 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermps %ymm11, %ymm9, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 224(%r8), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vbroadcastss %xmm4, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm5 +; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm12 +; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm12[1],xmm5[1],zero +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm7 +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rax), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm2 +; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm4 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] 
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,1,2,2,0,1,2,2] +; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpermps %ymm14, %ymm10, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 224(%r8), %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastss %xmm7, %ymm14 +; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm1[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FCP-NEXT: vbroadcastss 232(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm0 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 220(%r9), 
%ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6],ymm6[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%r9), %ymm6 ; AVX2-FCP-NEXT: vbroadcastsd 216(%rax), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm11 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FCP-NEXT: vpermps 224(%r9), %ymm11, %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] -; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm14[0],ymm0[1,2,3,4,5,6],ymm14[7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm12[0],ymm2[1],ymm12[1],ymm2[4],ymm12[4],ymm2[5],ymm12[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [5,6,5,6,5,6,5,6] +; AVX2-FCP-NEXT: vpermps 224(%r9), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm7[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 252(%r8), %ymm1 +; AVX2-FCP-NEXT: vbroadcastss 252(%r9), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vbroadcastss 252(%r9), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13261,27 +13259,27 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13294,7 +13292,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] @@ -13302,31 +13301,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; 
AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -13345,35 +13344,35 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -13398,39 +13397,39 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6],ymm9[7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4],ymm4[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -13454,11 +13453,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -13467,22 +13466,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] ; AVX2-FCP-NEXT: 
vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FCP-NEXT: vbroadcastsd 136(%rax), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -13502,11 +13501,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -13515,10 +13514,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: 
vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -13528,17 +13527,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm12[1,1],ymm1[5,5],ymm12[5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vbroadcastsd 176(%rax), %ymm2 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13550,11 +13549,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -13562,217 +13561,218 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm11[1,1],ymm0[5,5],ymm11[5,5] +; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: 
vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] 
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm6 = xmm2[3,3],mem[3,3] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm7 = xmm2[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3,4],ymm7[5,6],ymm8[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7] +; AVX2-FCP-NEXT: vmovaps %ymm13, %ymm0 +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[4],ymm13[4],ymm9[5],ymm13[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm8[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm8[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] -; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm10 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0],ymm4[1,2,3,4],ymm8[5,6,7] +; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm8[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm8[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = 
ymm14[0],ymm8[1,2,3,4],ymm14[5,6,7] ; AVX2-FCP-NEXT: vbroadcastss 176(%rdx), %ymm14 -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vmovaps %ymm12, %ymm1 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm15 = xmm8[3,3],mem[3,3] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = xmm12[3,3],mem[3,3] ; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm15 = xmm15[0,1,2],mem[3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] -; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3,4],ymm15[5,6],ymm12[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4],ymm12[5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX2-FCP-NEXT: 
# ymm12 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm1[3,3],mem[3,3] -; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm12 = xmm1[3,3],mem[3,3] +; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps %ymm4, 1440(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 1440(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm14, 1216(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm13, 1088(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 992(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm8, 1088(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 992(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm9, 864(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 768(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 640(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm13, 768(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 640(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 544(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 416(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) @@ -13879,43 +13879,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -13955,23 +13955,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -14031,29 +14031,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14113,7 +14113,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -14140,7 +14140,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -14155,7 +14155,7 @@ define void @store_i32_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-NEXT: movw $-30962, %cx # imm = 0x870E @@ -14182,7 +14182,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -14194,7 +14194,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 @@ -14220,12 +14220,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512-NEXT: movw $3612, %ax # imm = 0xE1C @@ -14246,23 +14246,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512-NEXT: vpermi2d 
%zmm1, %zmm14, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -14281,25 +14281,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -14351,43 +14351,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; 
AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -14427,23 +14427,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -14503,29 +14503,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14585,7 +14585,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -14612,7 +14612,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -14627,7 +14627,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -14654,7 +14654,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -14666,7 +14666,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; 
AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -14692,12 +14692,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -14718,23 +14718,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -14753,25 +14753,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -14823,43 +14823,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -14899,23 +14899,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] 
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -14975,29 +14975,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15057,7 +15057,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -15084,7 +15084,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -15099,7 +15099,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-NEXT: movw $-30962, %cx # imm = 0x870E @@ -15126,7 +15126,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -15138,7 +15138,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 @@ -15164,12 +15164,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-NEXT: movw $3612, %ax # imm = 0xE1C @@ -15190,23 +15190,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -15225,25 +15225,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -15295,43 +15295,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15371,23 +15371,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -15447,29 +15447,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15529,7 +15529,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -15556,7 +15556,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -15571,7 +15571,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -15598,7 +15598,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -15610,7 +15610,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -15636,12 +15636,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -15662,23 +15662,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -15697,25 +15697,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -15767,43 +15767,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -15843,23 +15843,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -15919,29 +15919,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16001,7 +16001,7 @@ define void 
@store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -16028,7 +16028,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -16043,7 +16043,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E @@ -16070,7 +16070,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -16082,7 +16082,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 @@ -16108,12 +16108,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C @@ -16134,23 +16134,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -16169,25 +16169,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -16239,43 +16239,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -16315,23 +16315,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -16391,29 +16391,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; 
AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16473,7 +16473,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -16500,7 +16500,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -16515,7 +16515,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -16542,7 +16542,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -16554,7 +16554,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -16580,12 +16580,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -16606,23 +16606,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -16641,25 +16641,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -16711,43 +16711,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 
%zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -16787,23 +16787,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -16863,29 +16863,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16945,7 +16945,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -16972,7 +16972,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -16987,7 +16987,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: movw $-30962, %cx # imm = 0x870E @@ -17014,7 +17014,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -17026,7 +17026,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 @@ -17052,12 +17052,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: movw $3612, %ax # imm = 0xE1C @@ -17078,23 +17078,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm0 
{%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -17113,25 +17113,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) @@ -17183,43 +17183,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,16,0,0,0,0,0,1,17,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = 
[0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,3,19,0,0,0,0,0,4,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,7,23,0,0,0,0,0,8,24,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,9,25,0,0,0,0,0,10,26,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,11,27,0,0,0,0,0,12,28,0,0,0,0,0,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,0,0,0,0,0,30,14,0,0,0,0,0,31,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -17259,23 +17259,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,5,21,0,0,0,0,0,6,22,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,7,23,0,0,0,0,0,8,24,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,9,25,0,0,0,0,0,10,26,0,0,0,0,0,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [11,0,0,0,0,0,28,12,0,0,0,0,0,29,13,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,15,31,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -17335,29 +17335,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,16,0,0,0,0,0,1,17,0,0,0,0,0,2,18] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,0,3,19,0,0,0,0,0,4,20,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,5,21,0,0,0,0,0,6,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,7,23,0,0,0,0,0,8,24,0,0,0,0,0,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [9,0,0,0,0,0,26,10,0,0,0,0,0,27,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,13,29,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,14,30,0,0,0,0,0,15,31,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -17417,7 +17417,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [22,0,0,0,0,5,6,23,0,0,0,0,12,13,24,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -17444,7 +17444,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,3,4,25,0,0,0,0,10,11,26,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -17459,7 +17459,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,0,0,0,0,8,9,28,0,0,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movw $-30962, %cx # imm = 0x870E @@ -17486,7 +17486,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,29,0,0,0,0,6,7,30,0,0,0,0,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -17498,7 +17498,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm0 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 @@ -17524,12 +17524,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,16,0,0,0,0,0,1,17,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,2,3,18,0,0,0,0,9,10,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movw $3612, %ax # imm = 0xE1C @@ -17550,23 +17550,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,2,18,0,0,0,0,0,3,19,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,1,2,3,4,23,0,0,8,9,10,11,24,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,2,25,0,0,6,7,8,9,26,0,0,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,27,0,0,4,5,6,7,28,0,0,11,12,13,14,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,2,3,4,5,30,0,0,9,10,11,12,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,20,0,0,0,0,7,8,21,0,0,0,0,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, 
%zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -17585,25 +17585,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [4,20,0,0,0,0,0,5,21,0,0,0,0,0,6,22] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,23,0,8,9,10,11,12,24,0,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,25,0,6,7,8,9,10,26,0,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,27,0,4,5,6,7,8,28,0,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [29,0,2,3,4,5,6,30,0,9,10,11,12,13,31,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1472(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index cf246e4ede089..902e19ebec881 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -174,7 +174,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; 
AVX512-NEXT: vzeroupper @@ -199,7 +199,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -224,7 +224,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -249,7 +249,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -274,7 +274,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -299,7 +299,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -324,7 +324,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -349,7 +349,7 @@ define void @store_i32_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,16,18,20,22,1,3,5,7,17,19,21,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -486,19 +486,19 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps (%r8), %xmm5 ; AVX2-NEXT: vmovaps (%r11), %xmm6 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5 -; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm8 +; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm9 +; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm9 ; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vpermps %ymm8, %ymm6, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] -; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm10 +; AVX2-NEXT: vpermps %ymm7, %ymm9, %ymm10 ; AVX2-NEXT: vpermps %ymm5, %ymm9, %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -506,15 +506,15 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] -; AVX2-NEXT: vpermps %ymm8, %ymm10, %ymm11 +; AVX2-NEXT: vpermps %ymm7, %ymm10, %ymm11 ; AVX2-NEXT: vpermps %ymm5, %ymm10, %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-NEXT: vpermps %ymm7, %ymm10, %ymm4 +; AVX2-NEXT: vpermps %ymm8, %ymm10, %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3,7,3,7,3,7,3,7] -; AVX2-NEXT: vpermps %ymm8, %ymm4, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm7 ; AVX2-NEXT: vpermps %ymm5, %ymm4, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -540,19 +540,19 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps (%r8), %xmm5 ; AVX2-FP-NEXT: vmovaps (%r11), %xmm6 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FP-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5 -; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm8 +; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm6 
= [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm9 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm9 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm6, %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpermps %ymm8, %ymm6, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm10 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm9, %ymm10 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm9, %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -560,15 +560,15 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm10, %ymm11 +; AVX2-FP-NEXT: vpermps %ymm7, %ymm10, %ymm11 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm10, %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpermps %ymm7, %ymm10, %ymm4 +; AVX2-FP-NEXT: vpermps %ymm8, %ymm10, %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3,7,3,7,3,7,3,7] -; AVX2-FP-NEXT: vpermps %ymm8, %ymm4, %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm7 ; AVX2-FP-NEXT: vpermps %ymm5, %ymm4, %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -594,19 +594,19 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm5 ; AVX2-FCP-NEXT: vmovaps (%r11), %xmm6 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm8 +; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm6, %ymm9 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm6, %ymm9 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm6, %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1,5,1,5,1,5,1,5] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm9, %ymm10 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm9, %ymm10 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -614,15 +614,15 @@ define void 
@store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm10, %ymm11 +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm10, %ymm11 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm10, %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpermps %ymm7, %ymm10, %ymm4 +; AVX2-FCP-NEXT: vpermps %ymm8, %ymm10, %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3,7,3,7,3,7,3,7] -; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm7 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -647,13 +647,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -671,13 +671,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -695,13 +695,13 
@@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -719,13 +719,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -743,13 +743,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ 
-767,13 +767,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -791,13 +791,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -815,13 +815,13 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1461,26 +1461,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-52, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1503,26 +1503,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-52, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1545,26 +1545,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-52, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1587,26 +1587,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-52, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1629,26 +1629,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-52, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 
%zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1671,26 +1671,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-52, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1713,26 +1713,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, 
(%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-52, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1755,26 +1755,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,8,16,24,0,0,0,0,1,9,17,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,8,16,24,0,0,0,0,1,9,17,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,2,10,18,26,0,0,0,0,3,11,19,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,10,18,26,0,0,0,0,3,11,19,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: 
vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,4,12,20,28,0,0,0,0,5,13,21,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,12,20,28,0,0,0,0,5,13,21,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,6,14,22,30,0,0,0,0,7,15,23,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,14,22,30,0,0,0,0,7,15,23,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2295,9 +2295,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-NEXT: vmovaps 32(%r8), %ymm11 ; AVX2-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-NEXT: vmovaps 32(%rax), %ymm13 @@ -2305,10 +2305,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] -; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2325,32 +2325,32 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%r9), %ymm2 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = 
ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] -; AVX2-NEXT: vmovaps (%r10), %ymm10 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 52(%r8), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-NEXT: vbroadcastss 52(%r8), %ymm11 +; AVX2-NEXT: vmovaps (%r10), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5],ymm11[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 24(%rax), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] @@ -2360,13 +2360,13 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastss 28(%r10), %ymm6 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] ; 
AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -2374,10 +2374,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-NEXT: vbroadcastss 20(%r8), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 20(%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] @@ -2543,9 +2543,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm11 ; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-FP-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-FP-NEXT: vmovaps 32(%rax), %ymm13 @@ -2553,10 +2553,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2573,32 +2573,32 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%r9), %ymm2 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = 
ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] -; AVX2-FP-NEXT: vmovaps (%r10), %ymm10 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm11 +; AVX2-FP-NEXT: vmovaps (%r10), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5],ymm11[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 24(%rax), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] @@ -2608,13 +2608,13 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastss 28(%r10), %ymm6 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = 
ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -2622,10 +2622,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vbroadcastss 20(%r8), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 20(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] @@ -2791,9 +2791,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm11 ; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-FCP-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-FCP-NEXT: vmovaps 32(%rax), %ymm13 @@ -2801,10 +2801,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2821,32 +2821,32 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%r9), %ymm2 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = 
ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vmovaps (%r10), %ymm10 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm11 +; AVX2-FCP-NEXT: vmovaps (%r10), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5],ymm11[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 24(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] @@ -2856,13 +2856,13 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastss 28(%r10), %ymm6 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = 
xmm4[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -2870,10 +2870,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vbroadcastss 20(%r8), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 20(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] @@ -3042,16 +3042,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-NEXT: movb $-120, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: movb $34, %cl ; AVX512-NEXT: kmovw %ecx, %k2 @@ -3059,80 +3059,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-52, %cl ; AVX512-NEXT: kmovw %ecx, %k3 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = 
[0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = 
[0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3160,16 +3160,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512-FCP-NEXT: movb $-120, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: movb $34, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 @@ -3177,80 +3177,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $-52, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 
{%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3278,16 +3278,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-NEXT: movb $-120, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: movb $34, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 @@ -3295,80 +3295,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $-52, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; 
AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3396,16 +3396,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-120, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: movb $34, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 @@ -3413,80 +3413,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $-52, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, 
%zmm7, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 
%zmm2 {%k3} @@ -3514,16 +3514,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movb $-120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $34, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -3531,80 +3531,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-52, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} 
zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3632,16 +3632,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: movb $-120, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: movb $34, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 @@ -3649,80 +3649,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-52, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = 
[2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3750,16 +3750,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: movb $-120, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: movb $34, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -3767,80 +3767,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movb $-52, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = 
[0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3868,16 +3868,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-120, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $34, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -3885,80 +3885,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-52, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -4960,43 +4960,43 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-NEXT: vmovaps (%rcx), %ymm9 
+; AVX2-NEXT: vmovaps (%rcx), %ymm7 ; AVX2-NEXT: vmovaps (%r8), %ymm10 ; AVX2-NEXT: vmovaps (%r9), %ymm12 -; AVX2-NEXT: vmovaps (%rax), %ymm8 +; AVX2-NEXT: vmovaps (%rax), %ymm9 ; AVX2-NEXT: vmovaps (%r10), %ymm11 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] -; AVX2-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 24(%r10), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] -; AVX2-NEXT: vmovaps 32(%r9), %ymm8 +; AVX2-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vmovaps 32(%r9), %ymm9 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-NEXT: vmovaps 32(%rax), %ymm10 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX2-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] @@ -5005,45 +5005,45 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastss 28(%rax), %ymm5 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = 
ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; AVX2-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 56(%r10), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm5[7] ; AVX2-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = 
ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 60(%rax), %ymm1 @@ -5051,7 +5051,7 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -5088,10 +5088,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vbroadcastss 92(%rax), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 92(%rax), %ymm0 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-NEXT: vmovaps 96(%rsi), %ymm1 @@ -5125,10 +5125,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-NEXT: vbroadcastss 120(%r10), %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 120(%r10), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -5430,43 +5430,43 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm7 ; AVX2-FP-NEXT: vmovaps (%r8), %ymm10 ; AVX2-FP-NEXT: vmovaps (%r9), %ymm12 -; AVX2-FP-NEXT: vmovaps (%rax), %ymm8 +; AVX2-FP-NEXT: vmovaps (%rax), %ymm9 ; AVX2-FP-NEXT: vmovaps (%r10), %ymm11 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = 
ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 24(%r10), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm8 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm9 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-FP-NEXT: vmovaps 32(%rax), %ymm10 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX2-FP-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] @@ -5475,45 +5475,45 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastss 28(%rax), %ymm5 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = 
ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] +; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 56(%r10), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm9, 
%xmm0 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 60(%rax), %ymm1 @@ -5521,7 +5521,7 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -5558,10 +5558,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastss 92(%rax), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 92(%rax), %ymm0 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1 @@ -5595,10 +5595,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vbroadcastss 120(%r10), %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 120(%r10), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -5900,43 +5900,43 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm7 ; AVX2-FCP-NEXT: vmovaps (%r8), %ymm10 ; AVX2-FCP-NEXT: vmovaps (%r9), %ymm12 
-; AVX2-FCP-NEXT: vmovaps (%rax), %ymm8 +; AVX2-FCP-NEXT: vmovaps (%rax), %ymm9 ; AVX2-FCP-NEXT: vmovaps (%r10), %ymm11 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 24(%r10), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm8 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm9 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-FCP-NEXT: vmovaps 32(%rax), %ymm10 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX2-FCP-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] @@ -5945,45 +5945,45 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vbroadcastss 28(%rax), %ymm5 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 56(%r10), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 60(%rax), %ymm1 @@ -5991,7 +5991,7 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -6028,10 +6028,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastss 92(%rax), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 92(%rax), %ymm0 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1 @@ -6065,10 +6065,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vbroadcastss 120(%r10), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 120(%r10), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -6371,57 +6371,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512-NEXT: 
vmovdqa64 64(%rax), %zmm28 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: 
vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -6443,10 +6443,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -6467,28 +6467,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-NEXT: 
vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -6507,28 +6507,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -6660,57 +6660,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] 
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -6732,10 +6732,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -6756,28 +6756,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -6796,28 +6796,28 @@ define void 
@store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -6949,57 +6949,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7021,10 +7021,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7045,28 +7045,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7085,28 +7085,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7238,57 +7238,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7310,10 +7310,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7334,28 +7334,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7374,28 +7374,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7527,57 +7527,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7599,10 +7599,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; 
AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7623,28 +7623,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7663,28 +7663,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7816,57 +7816,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7888,10 +7888,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7912,28 +7912,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7952,28 +7952,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -8105,57 +8105,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -8177,10 +8177,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; 
AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -8201,28 +8201,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -8241,28 +8241,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -8394,57 +8394,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -8466,10 +8466,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -8490,28 +8490,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -8530,28 +8530,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -10654,16 +10654,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] -; AVX2-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-NEXT: vmovaps 32(%r9), %ymm10 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] @@ -10694,10 +10694,10 @@ 
define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm4 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] @@ -10763,10 +10763,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vbroadcastss 92(%r10), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 92(%r10), %ymm0 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-NEXT: vmovaps 96(%rsi), %ymm1 @@ -10800,10 +10800,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-NEXT: vbroadcastss 120(%rax), %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 120(%rax), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -10886,10 +10886,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-NEXT: vbroadcastss 180(%r8), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 180(%r8), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm7[5],ymm12[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm5, 
%xmm5 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] @@ -10955,10 +10955,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vbroadcastss 220(%r10), %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 220(%r10), %ymm2 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] @@ -10992,10 +10992,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-NEXT: vbroadcastss 248(%rax), %ymm10 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastss 248(%rax), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] @@ -11600,16 +11600,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm10 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] @@ -11640,10 +11640,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm4 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] @@ -11709,10 +11709,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastss 92(%r10), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 92(%r10), %ymm0 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm1 @@ -11746,10 +11746,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vbroadcastss 120(%rax), %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 120(%rax), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -11832,10 +11832,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-FP-NEXT: vbroadcastss 180(%r8), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 180(%r8), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm7[5],ymm12[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] @@ -11901,10 +11901,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastss 220(%r10), %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 220(%r10), %ymm2 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] @@ -11938,10 +11938,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vbroadcastss 248(%rax), %ymm10 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastss 248(%rax), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] @@ -12546,16 +12546,16 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm13 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm10 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = 
ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] @@ -12586,10 +12586,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 52(%r8), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm4 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] @@ -12655,10 +12655,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastss 92(%r10), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 92(%r10), %ymm0 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1 @@ -12692,10 +12692,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vbroadcastss 120(%rax), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 120(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] @@ -12778,10 +12778,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-FCP-NEXT: vbroadcastss 180(%r8), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 
180(%r8), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm7[5],ymm12[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] @@ -12847,10 +12847,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastss 220(%r10), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 220(%r10), %ymm2 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] @@ -12884,10 +12884,10 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vbroadcastss 248(%rax), %ymm10 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastss 248(%rax), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] @@ -13474,41 +13474,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = 
[0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -13577,42 +13577,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; 
AVX512-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -13681,39 +13681,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13782,35 +13782,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14125,41 +14125,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -14228,42 +14228,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -14332,39 +14332,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -14433,35 +14433,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 
= [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -14776,41 +14776,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -14879,42 +14879,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -14983,39 +14983,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -15084,35 +15084,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -15427,41 +15427,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -15530,42 +15530,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -15634,39 +15634,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -15735,35 +15735,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16078,41 +16078,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; 
AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -16181,42 +16181,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; 
AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -16285,39 +16285,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -16386,35 +16386,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -16729,41 +16729,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -16832,42 +16832,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 
= [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -16936,39 +16936,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -17037,35 +17037,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, 
%zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -17380,41 +17380,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -17483,42 +17483,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, 
%zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -17587,39 +17587,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -17688,35 +17688,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -18031,41 +18031,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,0,16,0,0,0,0,0,0,1,17] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,2,18,0,0,0,0,0,0,3,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,4,20,0,0,0,0,0,0,5,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,6,22,0,0,0,0,0,0,7,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,8,24,0,0,0,0,0,0,9,25] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,10,26,0,0,0,0,0,0,11,27] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,12,28,0,0,0,0,0,0,13,29] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,14,30,0,0,0,0,0,0,15,31] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 @@ -18134,42 +18134,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,16,0,0,0,0,0,0,1,17,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,2,18,0,0,0,0,0,0,3,19,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,0,0,4,20,0,0,0,0,0,0,5,21,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,6,22,0,0,0,0,0,0,7,23,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,8,24,0,0,0,0,0,0,9,25,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,10,26,0,0,0,0,0,0,11,27,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,12,28,0,0,0,0,0,0,13,29,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,14,30,0,0,0,0,0,0,15,31,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 @@ -18238,39 +18238,39 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,16,0,0,0,0,0,0,1,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,2,18,0,0,0,0,0,0,3,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,20,0,0,0,0,0,0,5,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,6,22,0,0,0,0,0,0,7,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,8,24,0,0,0,0,0,0,9,25,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,10,26,0,0,0,0,0,0,11,27,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,12,28,0,0,0,0,0,0,13,29,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,14,30,0,0,0,0,0,0,15,31,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -18339,35 +18339,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,16,0,0,0,0,0,0,1,17,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [2,18,0,0,0,0,0,0,3,19,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [4,20,0,0,0,0,0,0,5,21,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,22,0,0,0,0,0,0,7,23,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,24,0,0,0,0,0,0,9,25,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [10,26,0,0,0,0,0,0,11,27,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,28,0,0,0,0,0,0,13,29,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [14,30,0,0,0,0,0,0,15,31,0,0,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index 8d68f88249a9e..0ac9143417760 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -163,8 +163,8 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; ; AVX-LABEL: store_i64_stride2_vf4: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rsi), %xmm0 -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovapd (%rsi), %xmm0 +; AVX-NEXT: vmovapd (%rdi), %xmm1 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -225,7 +225,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper @@ -235,7 +235,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-FCP-NEXT: vzeroupper @@ -245,7 +245,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: vmovaps 
{{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -255,7 +255,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -265,7 +265,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -275,7 +275,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512BW-FCP-NEXT: vzeroupper @@ -285,7 +285,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-NEXT: vzeroupper @@ -295,7 +295,7 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -445,9 +445,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -458,9 +458,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -471,9 +471,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -484,9 +484,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -497,9 +497,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -510,9 +510,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -523,9 +523,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -536,9 +536,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) @@ -617,10 +617,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX-LABEL: store_i64_stride2_vf16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rsi), %xmm0 -; AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX-NEXT: vmovaps (%rsi), %xmm1 +; AVX-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX-NEXT: vmovaps 64(%rsi), %xmm3 +; AVX-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX-NEXT: vmovaps (%rdi), %xmm4 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX-NEXT: vmovaps 64(%rdi), %xmm6 @@ -801,10 +801,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -821,10 +821,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -841,10 +841,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -861,10 +861,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -881,10 +881,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -901,10 +901,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -921,10 +921,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -941,10 +941,10 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 @@ -1479,10 +1479,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1513,10 +1513,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1547,10 +1547,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1581,10 +1581,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1615,10 +1615,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = 
[0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1649,10 +1649,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1683,10 +1683,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -1717,10 +1717,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -2233,13 +2233,13 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX-NEXT: vmovapd %ymm10, 672(%rdx) ; AVX-NEXT: vmovapd %ymm9, 608(%rdx) ; AVX-NEXT: vmovapd %ymm8, 544(%rdx) -; AVX-NEXT: vmovapd %ymm0, 480(%rdx) -; AVX-NEXT: vmovapd %ymm1, 416(%rdx) -; AVX-NEXT: vmovapd %ymm2, 352(%rdx) -; AVX-NEXT: vmovapd %ymm3, 288(%rdx) -; AVX-NEXT: vmovapd %ymm4, 224(%rdx) -; AVX-NEXT: vmovapd %ymm5, 160(%rdx) -; AVX-NEXT: vmovapd %ymm6, 96(%rdx) +; AVX-NEXT: vmovapd %ymm9, 480(%rdx) +; AVX-NEXT: vmovapd %ymm10, 416(%rdx) +; AVX-NEXT: vmovapd %ymm11, 352(%rdx) +; AVX-NEXT: vmovapd %ymm12, 288(%rdx) +; AVX-NEXT: vmovapd %ymm13, 224(%rdx) +; AVX-NEXT: vmovapd %ymm14, 160(%rdx) +; AVX-NEXT: vmovapd %ymm15, 96(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2905,10 +2905,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %out.v ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -2967,10 +2967,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3029,10 +3029,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3091,10 +3091,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3153,10 +3153,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512BW-NEXT: 
vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3215,10 +3215,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3277,10 +3277,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -3339,10 +3339,10 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index fe39c769c3545..44e83bb4abc75 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -94,7 +94,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-NEXT: vmovaps %ymm0, (%rcx) @@ -106,7 +106,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = 
[0,2,4,1,3,5,0,0] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -118,7 +118,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512DQ-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512DQ-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-NEXT: vmovaps %ymm0, (%rcx) @@ -130,7 +130,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -142,7 +142,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-NEXT: vmovaps %ymm0, (%rcx) @@ -154,7 +154,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -166,7 +166,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512DQ-BW-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rcx) @@ -178,7 +178,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,1,3,5,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rcx) @@ -225,12 +225,12 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovapd (%rsi), %ymm0 ; AVX-NEXT: 
vmovapd (%rdx), %ymm1 ; AVX-NEXT: vmovaps (%rdi), %xmm2 -; AVX-NEXT: vmovapd 16(%rdi), %xmm3 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX-NEXT: vmovapd 16(%rdi), %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm4[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3] @@ -247,22 +247,22 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vbroadcastsd (%rdx), %ymm3 +; AVX2-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -270,22 +270,22 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd (%rdx), %ymm3 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -293,22 +293,22 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd (%rdx), %ymm3 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -317,9 +317,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -331,9 +331,9 @@ define void 
@store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -345,9 +345,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -359,9 +359,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -373,9 +373,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -387,9 +387,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -401,9 +401,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: 
vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -415,9 +415,9 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [10,3,7,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,7,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,1,5,9,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, 64(%rcx) @@ -481,36 +481,36 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i64_stride3_vf8: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd (%rsi), %ymm0 -; AVX-NEXT: vmovapd 32(%rsi), %ymm1 -; AVX-NEXT: vmovapd (%rdx), %ymm2 -; AVX-NEXT: vmovapd 32(%rdx), %ymm3 +; AVX-NEXT: vmovapd (%rsi), %ymm3 +; AVX-NEXT: vmovapd 32(%rsi), %ymm2 +; AVX-NEXT: vmovapd (%rdx), %ymm0 +; AVX-NEXT: vmovapd 32(%rdx), %ymm1 ; AVX-NEXT: vmovaps (%rdi), %xmm4 ; AVX-NEXT: vmovapd 16(%rdi), %xmm5 ; AVX-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX-NEXT: vmovapd 48(%rdi), %xmm7 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm4[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm4[0],mem[0] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm6[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm1[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] +; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm7 +; AVX-NEXT: vmovapd 48(%rdi), %xmm8 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm1[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm2[0,0,3,2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm0[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm8[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm3[0,0,3,2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,0,2,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2],ymm3[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2],ymm5[3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm0[1,0,2,2] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX-NEXT: vmovapd %ymm5, 64(%rcx) ; AVX-NEXT: vmovapd %ymm7, 160(%rcx) ; AVX-NEXT: vmovapd %ymm1, 128(%rcx) @@ -530,7 +530,6 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] @@ -547,6 +546,7 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -571,7 +571,6 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] @@ -588,6 +587,7 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -612,7 +612,6 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] @@ -629,6 +628,7 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -648,17 +648,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -671,17 +671,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -694,17 +694,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -717,17 +717,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -740,17 +740,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; 
AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -763,17 +763,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -786,17 +786,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -809,17 +809,17 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) @@ -947,37 +947,37 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd (%rsi), %ymm6 ; AVX-NEXT: vmovapd 32(%rsi), %ymm2 ; AVX-NEXT: vmovapd 64(%rsi), %ymm5 -; AVX-NEXT: vmovapd 96(%rsi), %ymm1 +; AVX-NEXT: vmovapd 96(%rsi), %ymm0 ; AVX-NEXT: vmovapd (%rdx), %ymm9 ; AVX-NEXT: vmovapd 32(%rdx), %ymm4 -; AVX-NEXT: vmovapd 64(%rdx), %ymm8 +; AVX-NEXT: vmovapd 64(%rdx), %ymm7 ; AVX-NEXT: vmovapd 96(%rdx), %ymm3 -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovaps (%rdi), %xmm1 ; AVX-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm1[0],mem[0] ; AVX-NEXT: vmovapd 48(%rdi), %xmm13 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm7[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm8[0],mem[0] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 ; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] ; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm10[0],mem[0] +; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm12 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm11 -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX-NEXT: vmovaps 96(%rdi), %xmm11 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm11[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX-NEXT: vinsertf128 $1, 96(%rdx), %ymm12, %ymm12 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm11[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm12 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX-NEXT: vmovapd 80(%rdi), %xmm12 -; AVX-NEXT: vblendpd 
{{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm7[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm5[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm7[2,3],ymm14[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0,0,3,2] @@ -985,29 +985,29 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3] ; AVX-NEXT: vmovapd 112(%rdi), %xmm14 ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0,0,3,2] +; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0,0,3,2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm15[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] ; AVX-NEXT: vmovapd 16(%rdi), %xmm15 ; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0,0,3,2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2],ymm15[3] ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovapd %ymm0, 64(%rcx) +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovapd %ymm1, 64(%rcx) ; AVX-NEXT: vmovapd %ymm14, 352(%rcx) -; AVX-NEXT: vmovapd %ymm1, 320(%rcx) +; AVX-NEXT: vmovapd %ymm0, 320(%rcx) ; AVX-NEXT: vmovapd %ymm13, 160(%rcx) ; AVX-NEXT: vmovapd %ymm2, 128(%rcx) ; AVX-NEXT: vmovapd %ymm5, 224(%rcx) @@ -1015,7 +1015,7 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd %ymm6, 32(%rcx) ; AVX-NEXT: vmovaps %ymm11, 288(%rcx) ; AVX-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX-NEXT: vmovaps %ymm8, 192(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, (%rcx) ; AVX-NEXT: vzeroupper @@ -1049,7 +1049,6 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-NEXT: 
vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] @@ -1062,13 +1061,14 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX2-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] @@ -1126,7 +1126,6 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] @@ -1139,13 +1138,14 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] @@ -1203,7 +1203,6 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] @@ -1216,13 +1215,14 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] @@ -1260,20 +1260,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1298,20 +1298,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,3,11,u,4,12,u,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1336,20 +1336,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1374,20 +1374,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1412,20 +1412,20 @@ define void 
@store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1450,20 +1450,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1488,20 +1488,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1526,20 +1526,20 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 @@ -1802,13 +1802,13 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-LABEL: store_i64_stride3_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX-NEXT: vmovapd (%rsi), %ymm6 +; AVX-NEXT: vmovapd (%rsi), %ymm7 +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 32(%rsi), %ymm6 ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 32(%rsi), %ymm5 -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd (%rdx), %ymm11 -; AVX-NEXT: vmovapd 32(%rdx), %ymm13 -; AVX-NEXT: vmovapd 64(%rdx), %ymm7 +; AVX-NEXT: vmovapd 32(%rdx), %ymm9 +; AVX-NEXT: vmovapd 64(%rdx), %ymm8 ; AVX-NEXT: vmovaps 
(%rdi), %xmm2 ; AVX-NEXT: vmovapd 16(%rdi), %xmm1 ; AVX-NEXT: vmovaps 32(%rdi), %xmm3 @@ -1816,12 +1816,12 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm3[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm2 +; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] @@ -1837,14 +1837,14 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX-NEXT: vinsertf128 $1, 128(%rdx), %ymm3, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX-NEXT: vinsertf128 $1, 128(%rdx), %ymm3, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm3[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm2 +; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm5, %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rdi), %xmm2 @@ -1860,21 +1860,22 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0,0,3,2] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0,0,3,2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0,0,3,2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 80(%rdi), %xmm0 -; AVX-NEXT: vmovupd %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] ; AVX-NEXT: vmovapd 64(%rsi), %ymm10 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 96(%rdx), %ymm5 @@ -1899,8 +1900,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 160(%rsi), %ymm6 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[0,0,3,2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm8[0],ymm0[1],ymm8[2],ymm0[3] ; AVX-NEXT: vmovapd 192(%rdx), %ymm0 ; AVX-NEXT: vmovapd 208(%rdi), %xmm8 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3] @@ -1912,9 +1912,9 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] ; AVX-NEXT: vmovapd 224(%rsi), %ymm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm2[0,0,3,2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm2[0,0,3,2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3] ; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX-NEXT: # ymm14 = mem[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] @@ -1922,7 +1922,8 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX-NEXT: # ymm14 = mem[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3] +; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -1942,12 +1943,11 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3] -; AVX-NEXT: vmovapd %ymm12, 736(%rcx) +; AVX-NEXT: vmovapd %ymm13, 736(%rcx) ; AVX-NEXT: vmovapd %ymm2, 704(%rcx) ; AVX-NEXT: vmovapd %ymm8, 640(%rcx) ; AVX-NEXT: vmovapd %ymm0, 608(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 544(%rcx) +; AVX-NEXT: vmovapd %ymm12, 544(%rcx) ; AVX-NEXT: vmovapd %ymm1, 512(%rcx) ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 448(%rcx) @@ -1960,7 +1960,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX-NEXT: vmovapd %ymm10, 224(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX-NEXT: vmovapd %ymm13, 128(%rcx) +; AVX-NEXT: vmovapd %ymm14, 128(%rcx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX-NEXT: vmovapd %ymm11, 32(%rcx) @@ -1989,17 +1989,17 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: subq $168, %rsp ; AVX2-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm0 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-NEXT: vmovaps 64(%rdx), %ymm9 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm15 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] @@ -2017,33 +2017,33 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 32(%rdx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 64(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] @@ -2085,44 +2085,44 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-NEXT: vbroadcastsd 192(%rdx), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 224(%rsi), %ymm7 ; AVX2-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] -; 
AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-NEXT: vmovaps 224(%rdx), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vmovaps %ymm4, 736(%rcx) +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovaps %ymm2, 736(%rcx) ; AVX2-NEXT: vmovaps %ymm8, 704(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-NEXT: vmovaps %ymm1, 640(%rcx) -; AVX2-NEXT: vmovaps %ymm5, 608(%rcx) -; AVX2-NEXT: vmovaps %ymm2, 576(%rcx) -; AVX2-NEXT: vmovaps %ymm3, 544(%rcx) +; AVX2-NEXT: vmovaps %ymm3, 608(%rcx) +; AVX2-NEXT: vmovaps %ymm4, 576(%rcx) +; AVX2-NEXT: vmovaps %ymm5, 544(%rcx) ; AVX2-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-NEXT: vmovaps %ymm15, 480(%rcx) ; AVX2-NEXT: vmovaps %ymm13, 448(%rcx) @@ -2158,17 +2158,17 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: subq $168, %rsp ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm0 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm9 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm15 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] @@ -2186,33 +2186,33 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 32(%rdx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 64(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] @@ -2254,44 +2254,44 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FP-NEXT: vbroadcastsd 192(%rdx), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm7 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vmovaps %ymm4, 736(%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, 736(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, 704(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 640(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm5, 608(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, 576(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, 544(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 608(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, 576(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm5, 544(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm15, 480(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm13, 448(%rcx) @@ -2327,17 +2327,17 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: subq $168, %rsp ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm9 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm15 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] @@ -2355,33 +2355,33 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 32(%rdx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 64(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] @@ -2423,44 +2423,44 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 
= ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vbroadcastsd 192(%rdx), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm7 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm4, 736(%rcx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm2, 736(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 704(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 640(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm5, 608(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 576(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 544(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 608(%rcx) +; 
AVX2-FCP-NEXT: vmovaps %ymm4, 576(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm5, 544(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm15, 480(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm13, 448(%rcx) @@ -2505,20 +2505,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2571,20 +2571,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2637,20 +2637,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = 
[0,8,0,1,9,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2703,20 +2703,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2769,20 +2769,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = 
[5,0,14,6,0,15,7,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2835,20 +2835,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2901,20 +2901,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, 
%zmm16 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -2967,20 +2967,20 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 @@ -3529,14 +3529,14 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm0 +; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm3, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rdi), %xmm0 @@ -3553,14 +3553,14 @@ 
define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 160(%rdx), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 192(%rdx), %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm0 +; AVX-NEXT: vinsertf128 $1, 192(%rdx), %ymm3, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %xmm0 @@ -3577,14 +3577,14 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, 288(%rdx), %ymm1, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 288(%rdx), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm0 +; AVX-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %xmm0 @@ -3601,14 +3601,14 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, 416(%rdx), %ymm1, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 416(%rdx), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vinsertf128 $1, 448(%rdx), %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm0 +; AVX-NEXT: vinsertf128 $1, 448(%rdx), %ymm3, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 480(%rdi), %xmm0 @@ -4034,17 +4034,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 192(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-NEXT: vbroadcastsd 192(%rdx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4106,17 +4106,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 320(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rsi), %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-NEXT: vbroadcastsd 320(%rdx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 320(%rsi), %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vmovaps 320(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4129,91 +4129,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; 
AVX2-NEXT: vbroadcastsd 448(%rdx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] -; AVX2-NEXT: vmovaps 448(%rsi), %ymm15 -; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-NEXT: vmovaps 448(%rdx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-NEXT: vbroadcastsd 448(%rdx), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 448(%rsi), %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 480(%rdx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 480(%rsi), %ymm13 -; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[0,1,2,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 480(%rdx), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 480(%rsi), %ymm15 +; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm2[4,5],ymm14[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] ; AVX2-NEXT: vmovaps 480(%rdx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovaps %ymm13, 1504(%rcx) -; AVX2-NEXT: vmovaps %ymm12, 1472(%rcx) +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovaps %ymm2, 1504(%rcx) +; AVX2-NEXT: vmovaps %ymm14, 1472(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-NEXT: vmovaps %ymm14, 1376(%rcx) -; AVX2-NEXT: vmovaps %ymm2, 1344(%rcx) -; AVX2-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-NEXT: vmovaps %ymm4, 1280(%rcx) -; AVX2-NEXT: vmovaps %ymm5, 1248(%rcx) -; AVX2-NEXT: vmovaps %ymm6, 1216(%rcx) -; AVX2-NEXT: vmovaps %ymm7, 1184(%rcx) -; 
AVX2-NEXT: vmovaps %ymm8, 1152(%rcx) -; AVX2-NEXT: vmovaps %ymm9, 1120(%rcx) -; AVX2-NEXT: vmovaps %ymm10, 1088(%rcx) -; AVX2-NEXT: vmovaps %ymm11, 1056(%rcx) +; AVX2-NEXT: vmovaps %ymm3, 1376(%rcx) +; AVX2-NEXT: vmovaps %ymm4, 1344(%rcx) +; AVX2-NEXT: vmovaps %ymm5, 1312(%rcx) +; AVX2-NEXT: vmovaps %ymm6, 1280(%rcx) +; AVX2-NEXT: vmovaps %ymm7, 1248(%rcx) +; AVX2-NEXT: vmovaps %ymm8, 1216(%rcx) +; AVX2-NEXT: vmovaps %ymm9, 1184(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 1152(%rcx) +; AVX2-NEXT: vmovaps %ymm11, 1120(%rcx) +; AVX2-NEXT: vmovaps %ymm12, 1088(%rcx) +; AVX2-NEXT: vmovaps %ymm13, 1056(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4395,17 +4395,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 192(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vbroadcastsd 192(%rdx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4467,17 +4467,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 320(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm1 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FP-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-FP-NEXT: vbroadcastsd 320(%rdx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm0 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vmovaps 320(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4490,91 +4490,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 448(%rdx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm15 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FP-NEXT: vbroadcastsd 448(%rdx), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm2 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 480(%rdx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm13 -; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[0,1,2,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 480(%rdx), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm15 +; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm2[4,5],ymm14[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] ; AVX2-FP-NEXT: vmovaps 480(%rdx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovaps %ymm13, 1504(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm12, 1472(%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovaps %ymm2, 1504(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm14, 1472(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm14, 1376(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, 1344(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm4, 1280(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm5, 1248(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm6, 1216(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 1184(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm8, 1152(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm9, 1120(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, 1088(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm11, 1056(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 1376(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, 1344(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm5, 1312(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm6, 1280(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 1248(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm8, 1216(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, 1184(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm10, 1152(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm11, 1120(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm12, 1088(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm13, 1056(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4756,17 +4756,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 192(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vbroadcastsd 192(%rdx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FCP-NEXT: 
vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4828,17 +4828,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 320(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm1 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,1,2,1] +; AVX2-FCP-NEXT: vbroadcastsd 320(%rdx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm0 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vmovaps 320(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4851,91 +4851,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 
= ymm0[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 448(%rdx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm15 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FCP-NEXT: vbroadcastsd 448(%rdx), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm2 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 
= ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 480(%rdx), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm13 -; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[0,1,2,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 480(%rdx), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm15 +; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm2[4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] ; AVX2-FCP-NEXT: vmovaps 480(%rdx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm13, 1504(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm12, 1472(%rcx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm2, 1504(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm14, 1472(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm14, 1376(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 1344(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 1280(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm5, 1248(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm6, 1216(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 1184(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm8, 1152(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 1120(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, 1088(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm11, 1056(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 1376(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, 1344(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm5, 1312(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm6, 1280(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 1248(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm8, 1216(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, 1184(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 1152(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm11, 1120(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm12, 1088(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm13, 1056(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5032,19 +5032,19 
@@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5153,19 +5153,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5274,19 +5274,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm15, 
%zmm19, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5395,19 +5395,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5516,19 +5516,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5637,19 +5637,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5758,19 +5758,19 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 @@ -5879,19 +5879,19 @@ define void 
@store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,1,9,0,2,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,3,11,0,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [5,0,14,6,0,15,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm13 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll index 2721540305491..816784dd7872c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -98,7 +98,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512-NEXT: vzeroupper @@ -110,7 +110,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512-FCP-NEXT: vzeroupper @@ -122,7 +122,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-NEXT: vzeroupper @@ -134,7 +134,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -146,7 +146,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512BW-NEXT: vzeroupper @@ -158,7 +158,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper @@ -170,7 +170,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper @@ -182,7 +182,7 @@ define void @store_i64_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,8,10,1,3,9,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -341,9 +341,9 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -377,9 +377,9 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] ; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -413,9 +413,9 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -449,9 +449,9 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rdx), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,1,9,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,10,6,14,3,11,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) @@ -726,26 +726,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512-NEXT: vpermi2q %zmm1, 
%zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -761,26 +761,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -796,26 +796,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-NEXT: 
vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -831,26 +831,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -866,26 +866,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: 
vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -901,26 +901,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -936,26 +936,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 
%zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -971,26 +971,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1289,10 +1289,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovaps (%rdi), %ymm7 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-NEXT: vmovaps (%rsi), %ymm9 -; AVX2-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-NEXT: vmovaps (%rcx), %ymm13 ; AVX2-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm2 @@ -1300,40 +1300,40 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm11 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 -; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX2-NEXT: vmovaps 96(%rsi), %xmm10 -; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm11 +; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm8, %ymm8 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm11, %ymm14 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm13[0],ymm10[2],ymm13[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] ; AVX2-NEXT: vmovaps 32(%rcx), %ymm13 -; 
AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] +; AVX2-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-NEXT: vmovaps 64(%rsi), %ymm15 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] @@ -1341,20 +1341,20 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] ; AVX2-NEXT: vmovaps 64(%rcx), %ymm13 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 96(%rsi), %ymm5 ; AVX2-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-NEXT: vmovaps 96(%rcx), %ymm15 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] ; AVX2-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-NEXT: vmovaps %ymm0, 448(%r8) @@ -1362,11 +1362,11 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm2, 320(%r8) ; AVX2-NEXT: vmovaps %ymm4, 224(%r8) ; AVX2-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-NEXT: vmovaps %ymm14, 64(%r8) ; AVX2-NEXT: vmovaps %ymm12, 416(%r8) -; AVX2-NEXT: vmovaps %ymm10, 384(%r8) -; AVX2-NEXT: vmovaps %ymm7, 288(%r8) +; AVX2-NEXT: vmovaps %ymm11, 384(%r8) +; AVX2-NEXT: vmovaps %ymm8, 288(%r8) ; AVX2-NEXT: vmovaps %ymm6, 256(%r8) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 160(%r8) @@ -1384,10 +1384,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: pushq %rax ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm9 -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FP-NEXT: vmovaps 
(%rdx), %ymm10 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm13 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm2 @@ -1395,40 +1395,40 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm11 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm10 -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm11 +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm8, %ymm8 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm12 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm11, %ymm14 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm13[0],ymm10[2],ymm13[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] ; AVX2-FP-NEXT: vmovaps 32(%rcx), %ymm13 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = 
ymm8[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm15 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] @@ -1436,20 +1436,20 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm13 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm5 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-FP-NEXT: vmovaps 96(%rcx), %ymm15 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r8) @@ -1457,11 +1457,11 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm2, 320(%r8) ; AVX2-FP-NEXT: vmovaps %ymm4, 224(%r8) ; AVX2-FP-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FP-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-FP-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-FP-NEXT: vmovaps %ymm14, 64(%r8) ; AVX2-FP-NEXT: vmovaps %ymm12, 416(%r8) -; AVX2-FP-NEXT: vmovaps %ymm10, 384(%r8) -; AVX2-FP-NEXT: vmovaps %ymm7, 288(%r8) +; AVX2-FP-NEXT: vmovaps %ymm11, 384(%r8) +; AVX2-FP-NEXT: vmovaps %ymm8, 288(%r8) ; AVX2-FP-NEXT: vmovaps %ymm6, 256(%r8) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r8) @@ -1479,10 +1479,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: pushq %rax ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm9 -; 
AVX2-FCP-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm13 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm2 @@ -1490,40 +1490,40 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm11 ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm11 +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm12 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm11, %ymm14 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm13[0],ymm10[2],ymm13[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm13 -; 
AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm15 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] @@ -1531,20 +1531,20 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm13 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm15 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r8) @@ -1552,11 +1552,11 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm2, 320(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm4, 224(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm14, 64(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm12, 416(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm10, 384(%r8) -; AVX2-FCP-NEXT: vmovaps %ymm7, 288(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm11, 384(%r8) +; AVX2-FCP-NEXT: vmovaps %ymm8, 288(%r8) ; AVX2-FCP-NEXT: vmovaps %ymm6, 256(%r8) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r8) @@ -1580,32 +1580,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-NEXT: 
vmovdqa64 64(%rcx), %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1641,32 +1641,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[6,14,u,u,7,15,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1702,32 +1702,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1763,32 +1763,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; 
AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1824,32 +1824,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = 
[4,12,0,0,5,13,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1885,32 +1885,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -1946,32 +1946,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 
%zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -2007,32 +2007,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [4,12,0,0,5,13,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -2668,8 +2668,8 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 @@ -2875,8 +2875,8 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5 @@ -3082,8 +3082,8 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5 @@ -3302,32 +3302,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512-NEXT: movb $-52, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = 
[4,12,0,0,5,13,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3415,32 +3415,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512-FCP-NEXT: movb $-52, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3528,32 +3528,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-NEXT: movb $-52, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] 
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3641,32 +3641,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movb $-52, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3754,32 +3754,32 @@ define void @store_i64_stride4_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3867,32 +3867,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-FCP-NEXT: movb $-52, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] 
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -3980,32 +3980,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: movb $-52, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -4093,32 +4093,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm14 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $-52, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -5438,8 +5438,8 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm5 @@ -5869,8 +5869,8 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm5 @@ -6300,8 +6300,8 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), 
%xmm4 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm5 @@ -6743,19 +6743,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -6835,19 +6835,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7038,19 +7038,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7130,19 +7130,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7333,19 +7333,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7425,19 +7425,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; 
AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7628,19 +7628,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -7720,19 +7720,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm2 = [4,12,0,0,5,13,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -7923,19 +7923,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8015,19 +8015,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8218,19 +8218,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; 
AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8310,19 +8310,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8513,19 +8513,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8605,19 +8605,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3 @@ -8808,19 +8808,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,3,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4 @@ -8900,19 +8900,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,1,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,10,0,0,3,11,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,12,0,0,5,13,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [6,14,0,0,7,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index f41123c5c3cfd..a58359ffc15d6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -119,13 +119,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) @@ -136,13 +136,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) @@ -153,13 +153,13 @@ 
define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) @@ -170,13 +170,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) @@ -187,13 +187,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9) @@ -204,13 +204,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) @@ -221,13 +221,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9) @@ -238,13 +238,13 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9) @@ -305,32 +305,32 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-LABEL: store_i64_stride5_vf4: ; AVX: # %bb.0: ; AVX-NEXT: vmovapd (%rdi), %ymm0 -; AVX-NEXT: vmovapd (%r8), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd (%r8), %ymm2 ; AVX-NEXT: vmovapd 16(%rdx), %xmm3 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3] ; AVX-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] ; AVX-NEXT: vbroadcastsd 24(%rcx), %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3] ; AVX-NEXT: vmovaps (%rdx), %xmm5 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm6 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],mem[0] ; AVX-NEXT: vmovaps (%rdi), %xmm5 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX-NEXT: vmovaps %xmm5, (%r9) -; AVX-NEXT: vmovaps %xmm1, 16(%r9) +; AVX-NEXT: vmovaps 
%xmm2, 16(%r9) ; AVX-NEXT: vmovapd %ymm4, 64(%r9) ; AVX-NEXT: vmovapd %ymm0, 32(%r9) -; AVX-NEXT: vmovapd %ymm2, 96(%r9) +; AVX-NEXT: vmovapd %ymm1, 96(%r9) ; AVX-NEXT: vmovapd %ymm3, 128(%r9) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -346,9 +346,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] @@ -360,9 +360,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm7 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-NEXT: vmovaps %ymm1, 128(%r9) @@ -383,9 +383,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] @@ -397,9 +397,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm7 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9) @@ -420,9 +420,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 
(%rdi), %xmm7 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] @@ -434,9 +434,9 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm7 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9) @@ -453,15 +453,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) @@ -477,15 +477,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -501,15 +501,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) @@ -525,15 +525,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -549,15 +549,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) @@ -573,15 +573,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), 
%zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9) @@ -597,15 +597,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) @@ -621,15 +621,15 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [15,3,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,4,8,12,0,1,5,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,6,10,14,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 
(%r9) @@ -736,19 +736,19 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovapd 32(%rdi), %ymm9 ; AVX-NEXT: vmovaps (%rdi), %ymm1 ; AVX-NEXT: vmovaps (%rcx), %ymm0 -; AVX-NEXT: vmovaps (%r8), %ymm3 +; AVX-NEXT: vmovaps (%r8), %ymm2 +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] ; AVX-NEXT: vmovapd 32(%r8), %ymm5 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] ; AVX-NEXT: vmovaps 16(%rdx), %xmm7 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovaps (%rdx), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] @@ -766,7 +766,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm2[2,3],ymm12[4,5,6,7] ; AVX-NEXT: vmovapd 48(%rsi), %xmm13 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1] ; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm13 @@ -774,8 +774,8 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3] ; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm11[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm11[0],mem[0] ; AVX-NEXT: vmovaps (%rdi), %xmm7 ; AVX-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] @@ -784,12 +784,12 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps %xmm7, (%r9) ; AVX-NEXT: vmovaps %xmm4, 16(%r9) ; AVX-NEXT: vmovaps %xmm10, 160(%r9) -; AVX-NEXT: vmovaps %xmm3, 176(%r9) +; AVX-NEXT: vmovaps %xmm2, 176(%r9) ; AVX-NEXT: vmovaps %ymm12, 64(%r9) ; AVX-NEXT: vmovapd %ymm9, 192(%r9) ; AVX-NEXT: vmovapd %ymm8, 256(%r9) ; AVX-NEXT: vmovapd %ymm6, 224(%r9) -; AVX-NEXT: vmovaps %ymm2, 32(%r9) +; AVX-NEXT: vmovaps %ymm3, 32(%r9) ; AVX-NEXT: vmovaps %ymm1, 96(%r9) ; AVX-NEXT: vmovaps %ymm0, 128(%r9) ; AVX-NEXT: vmovapd %ymm5, 288(%r9) @@ -812,15 +812,15 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = 
ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps (%rsi), %xmm8 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 ; AVX2-NEXT: vbroadcastsd 40(%rsi), %ymm13 @@ -829,23 +829,23 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm11, %ymm11 ; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm14 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] ; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] @@ -880,15 +880,15 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm8 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} 
ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 ; AVX2-FP-NEXT: vbroadcastsd 40(%rsi), %ymm13 @@ -897,23 +897,23 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm11, %ymm11 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm14 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] ; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] @@ -948,15 +948,15 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 
$1, %xmm10, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vbroadcastsd 40(%rsi), %ymm13 @@ -965,23 +965,23 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm11, %ymm11 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm14 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] @@ -1002,56 +1002,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i64_stride5_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512-NEXT: movb $49, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512-NEXT: movb $8, %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: movb $-116, %al -; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512-NEXT: movb $-116, %cl +; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1061,56 +1061,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i64_stride5_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; 
AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512-FCP-NEXT: movb $49, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512-FCP-NEXT: movb $8, %al -; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512-FCP-NEXT: movb $-116, %al -; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512-FCP-NEXT: movb $-116, %cl +; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: 
vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1120,56 +1120,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i64_stride5_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512DQ-NEXT: movb $49, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQ-NEXT: movb $8, %al -; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: movb $-116, %al -; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-NEXT: movb $-116, %cl +; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; 
AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1179,56 +1179,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i64_stride5_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: movb $49, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: movb $8, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: movb $-116, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: movb $-116, %cl +; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-FCP-NEXT: 
vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512DQ-FCP-NEXT: movb $24, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1238,56 +1238,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-LABEL: store_i64_stride5_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512BW-NEXT: movb $8, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: movb $-116, %cl +; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1297,56 +1297,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-FCP-LABEL: store_i64_stride5_vf8: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; 
AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: movb $49, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: movb $8, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: movb $-116, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: movb $-116, %cl +; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1356,56 +1356,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-LABEL: store_i64_stride5_vf8: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: movb $49, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: movb $8, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: movb $-116, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: movb $-116, %cl +; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; 
AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1415,56 +1415,56 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride5_vf8: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [3,0,0,0,12,4,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,11,0,0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $49, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $8, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,0,8,0,0,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,8,0,0,0,1,9,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: movb $-116, %al -; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: movb $-116, %cl +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [1,0,0,0,10,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,2,10,0,0,0,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [6,14,0,0,0,7,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,14,u,u,u,7,15,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) @@ -1673,137 +1673,142 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i64_stride5_vf16: ; AVX: # 
%bb.0: -; AVX-NEXT: subq $216, %rsp +; AVX-NEXT: subq $248, %rsp ; AVX-NEXT: vmovapd 32(%rdi), %ymm5 ; AVX-NEXT: vmovaps (%rdi), %ymm8 -; AVX-NEXT: vmovapd 96(%rdi), %ymm2 +; AVX-NEXT: vmovapd 96(%rdi), %ymm6 ; AVX-NEXT: vmovaps (%rcx), %ymm0 -; AVX-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX-NEXT: vmovaps 16(%rdx), %xmm11 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vmovaps 80(%rdx), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 80(%rdx), %xmm11 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX-NEXT: vmovapd 96(%rcx), %xmm1 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rdx), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps (%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3] +; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX-NEXT: 
vbroadcastsd 72(%rsi), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%rdx), %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vbroadcastsd 104(%rsi), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] -; AVX-NEXT: vmovaps 96(%rdx), %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] -; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm11[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX-NEXT: vmovaps (%r8), %ymm9 -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX-NEXT: vmovapd 48(%rdx), %xmm6 -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX-NEXT: vmovapd 48(%rsi), %xmm8 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX-NEXT: vmovapd 32(%r8), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3] -; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3] -; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 64(%r8), %ymm9 -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm4[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd 112(%rdx), %xmm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX-NEXT: vmovapd 112(%rsi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm2 -; AVX-NEXT: 
vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX-NEXT: vmovapd 96(%r8), %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX-NEXT: vbroadcastsd 104(%rsi), %ymm13 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm13[2,3] +; AVX-NEXT: vmovaps 96(%rdx), %xmm13 +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm12 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX-NEXT: vmovaps (%r8), %ymm0 +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm5[2,3] +; AVX-NEXT: vmovapd 48(%rdx), %xmm4 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm0[0],ymm4[2],ymm0[3] +; AVX-NEXT: vmovapd 48(%rsi), %xmm5 +; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm12 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm4[0,1],ymm12[2,3] +; AVX-NEXT: vmovapd 32(%r8), %ymm0 +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm0[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm11[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX-NEXT: vmovaps 64(%r8), %ymm1 +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm6[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX-NEXT: vmovapd 112(%rsi), %xmm6 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm0[1] +; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] +; AVX-NEXT: vmovapd 96(%r8), %ymm0 +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3] ; AVX-NEXT: vblendpd {{.*#+}} 
ymm3 = ymm0[0],ymm3[1,2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm14[0],mem[0] -; AVX-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1,2],ymm0[3] +; AVX-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm13[0],mem[0] ; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX-NEXT: vmovaps (%rdi), %xmm15 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX-NEXT: vmovaps %xmm8, 16(%r9) +; AVX-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX-NEXT: vmovaps %xmm9, 16(%r9) ; AVX-NEXT: vmovaps %xmm15, (%r9) -; AVX-NEXT: vmovaps %xmm1, 496(%r9) -; AVX-NEXT: vmovaps %xmm13, 480(%r9) +; AVX-NEXT: vmovaps %xmm6, 496(%r9) +; AVX-NEXT: vmovaps %xmm12, 480(%r9) ; AVX-NEXT: vmovaps %xmm0, 176(%r9) -; AVX-NEXT: vmovaps %xmm12, 160(%r9) +; AVX-NEXT: vmovaps %xmm13, 160(%r9) ; AVX-NEXT: vmovaps %xmm10, 336(%r9) -; AVX-NEXT: vmovaps %xmm14, 320(%r9) -; AVX-NEXT: vmovapd %ymm9, 576(%r9) +; AVX-NEXT: vmovaps %xmm1, 320(%r9) +; AVX-NEXT: vmovapd %ymm2, 576(%r9) ; AVX-NEXT: vmovapd %ymm3, 512(%r9) -; AVX-NEXT: vmovaps %ymm5, 384(%r9) +; AVX-NEXT: vmovaps %ymm14, 384(%r9) ; AVX-NEXT: vmovaps %ymm7, 352(%r9) -; AVX-NEXT: vmovapd %ymm11, 256(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 256(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 224(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1812,76 +1817,74 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 64(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%r9) -; AVX-NEXT: vmovapd %ymm2, 608(%r9) -; AVX-NEXT: vmovapd %ymm4, 544(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 448(%r9) -; AVX-NEXT: vmovaps %ymm6, 416(%r9) +; AVX-NEXT: vmovapd %ymm4, 608(%r9) +; AVX-NEXT: vmovapd %ymm5, 544(%r9) +; AVX-NEXT: vmovaps %ymm11, 448(%r9) +; AVX-NEXT: vmovaps %ymm8, 416(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 288(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 128(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 96(%r9) -; AVX-NEXT: addq $216, %rsp 
+; AVX-NEXT: addq $248, %rsp ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i64_stride5_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-NEXT: vmovaps (%rdi), %ymm11 ; AVX2-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX2-NEXT: vmovaps 64(%rsi), %xmm0 ; AVX2-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 -; AVX2-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} 
ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd 72(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 96(%rdx), %xmm2 @@ -1890,80 +1893,81 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 120(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} 
ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-NEXT: vmovaps (%r8), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] -; AVX2-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX2-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] -; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vmovaps 96(%r8), %ymm15 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-NEXT: vmovaps %ymm3, 576(%r9) +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm1 +; AVX2-NEXT: vpermpd 
{{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm11[2,3] +; AVX2-NEXT: vmovaps (%r8), %ymm2 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1],ymm2[2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm9[2,3] +; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovaps 32(%r8), %ymm11 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-NEXT: # ymm4 = ymm4[2,3],mem[2,3] +; AVX2-NEXT: vmovaps 64(%r8), %ymm7 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm7[2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm0[2,3] +; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovaps 96(%r8), %ymm14 +; AVX2-NEXT: vblendps $252, (%rsp), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7] +; AVX2-NEXT: vmovaps %ymm7, 576(%r9) ; AVX2-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-NEXT: vmovaps %ymm1, 512(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-NEXT: vmovaps %ymm14, 384(%r9) +; AVX2-NEXT: vmovaps %ymm4, 416(%r9) +; AVX2-NEXT: vmovaps %ymm13, 384(%r9) ; AVX2-NEXT: vmovaps %ymm11, 352(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 320(%r9) @@ -1972,16 +1976,16 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm8, 192(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-NEXT: vmovaps %ymm10, 96(%r9) +; AVX2-NEXT: vmovaps %ymm9, 96(%r9) ; AVX2-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-NEXT: 
vmovaps %ymm13, 32(%r9) +; AVX2-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: vmovaps %ymm7, 448(%r9) +; AVX2-NEXT: vmovaps %ymm3, 448(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-NEXT: vmovaps %ymm4, 608(%r9) -; AVX2-NEXT: vmovaps %ymm9, 288(%r9) +; AVX2-NEXT: vmovaps %ymm2, 608(%r9) +; AVX2-NEXT: vmovaps %ymm10, 288(%r9) ; AVX2-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1989,58 +1993,57 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-LABEL: store_i64_stride5_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm11 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm0 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 -; AVX2-FP-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = 
ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-FP-NEXT: vbroadcastsd 72(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm2 @@ -2049,80 +2052,81 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 120(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] -; AVX2-FP-NEXT: 
vbroadcastsd 112(%rcx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm15 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovaps %ymm3, 576(%r9) +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm1 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1],ymm2[2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm9[2,3] +; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm11 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm4 = ymm4[2,3],mem[2,3] +; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm7 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm7[2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm14 +; AVX2-FP-NEXT: vblendps $252, (%rsp), %ymm14, %ymm1 # 32-byte Folded Reload +; 
AVX2-FP-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovaps %ymm7, 576(%r9) ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-FP-NEXT: vmovaps %ymm1, 512(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FP-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-FP-NEXT: vmovaps %ymm14, 384(%r9) +; AVX2-FP-NEXT: vmovaps %ymm4, 416(%r9) +; AVX2-FP-NEXT: vmovaps %ymm13, 384(%r9) ; AVX2-FP-NEXT: vmovaps %ymm11, 352(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%r9) @@ -2131,16 +2135,16 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm8, 192(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FP-NEXT: vmovaps %ymm10, 96(%r9) +; AVX2-FP-NEXT: vmovaps %ymm9, 96(%r9) ; AVX2-FP-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-FP-NEXT: vmovaps %ymm13, 32(%r9) +; AVX2-FP-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: vmovaps %ymm7, 448(%r9) +; AVX2-FP-NEXT: vmovaps %ymm3, 448(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FP-NEXT: vmovaps %ymm4, 608(%r9) -; AVX2-FP-NEXT: vmovaps %ymm9, 288(%r9) +; AVX2-FP-NEXT: vmovaps %ymm2, 608(%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, 288(%r9) ; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -2148,58 +2152,57 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-LABEL: store_i64_stride5_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm0 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm8 
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm2 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 72(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm2 @@ -2208,80 +2211,81 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 120(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm15 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm3, 576(%r9) +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm1 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1],ymm2[2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm9[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), 
%ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm11 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm4 = ymm4[2,3],mem[2,3] +; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm7 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm7[2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm14 +; AVX2-FCP-NEXT: vblendps $252, (%rsp), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm7, 576(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm1, 512(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm14, 384(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm4, 416(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm13, 384(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm11, 352(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%r9) @@ -2290,16 +2294,16 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm8, 192(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm10, 96(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm9, 96(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm13, 32(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: vmovaps %ymm7, 448(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm3, 448(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm4, 608(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm9, 288(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm2, 608(%r9) +; AVX2-FCP-NEXT: vmovaps 
%ymm10, 288(%r9) ; AVX2-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -2316,10 +2320,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512-NEXT: movb $49, %al @@ -2332,47 +2336,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512-NEXT: movb $-116, %al ; AVX512-NEXT: kmovw %eax, %k3 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512-NEXT: movb $24, %al ; AVX512-NEXT: kmovw %eax, %k2 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-NEXT: 
vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2415,10 +2419,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512-FCP-NEXT: movb $49, %al @@ -2431,47 +2435,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512-FCP-NEXT: movb $-116, %al ; AVX512-FCP-NEXT: kmovw %eax, %k3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512-FCP-NEXT: movb $24, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2514,10 +2518,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-NEXT: movb $49, %al @@ -2530,47 +2534,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-NEXT: movb $-116, %al ; AVX512DQ-NEXT: kmovw %eax, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-NEXT: movb $24, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm12, 
%zmm20, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2613,10 +2617,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: movb $49, %al @@ -2629,47 +2633,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: movb $-116, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-FCP-NEXT: movb $24, %al ; 
AVX512DQ-FCP-NEXT: kmovw %eax, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2712,10 +2716,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512BW-NEXT: movb $49, %al @@ -2728,47 +2732,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2811,10 +2815,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: movb $49, %al @@ -2827,47 +2831,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512BW-FCP-NEXT: movb $-116, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512BW-FCP-NEXT: movb $24, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -2910,10 +2914,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: 
vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: movb $49, %al @@ -2926,47 +2930,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-BW-NEXT: movb $-116, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-BW-NEXT: movb $24, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -3009,10 +3013,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $49, %al @@ -3025,47 +3029,47 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,8,0,0,0,1,9,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $-116, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,5,13,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,0,0,0,10,2,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 @@ -3506,29 +3510,28 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-LABEL: store_i64_stride5_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $1048, %rsp # imm = 0x418 -; AVX-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX-NEXT: vmovapd 160(%rdi), %ymm7 -; AVX-NEXT: vmovapd 96(%rdi), %ymm5 +; AVX-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX-NEXT: vmovapd 160(%rdi), %ymm8 +; AVX-NEXT: vmovapd 96(%rdi), %ymm4 ; AVX-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX-NEXT: vmovaps (%rcx), %ymm1 ; AVX-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vmovaps 80(%rdx), %xmm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX-NEXT: vmovapd 96(%rcx), %xmm2 -; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX-NEXT: vmovapd 96(%rcx), %xmm1 +; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vmovaps 144(%rdx), %xmm2 @@ -3536,13 +3539,13 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2] ; AVX-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -3555,16 +3558,16 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps (%rdi), %ymm8 +; AVX-NEXT: vmovaps (%rdi), %ymm13 ; AVX-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps (%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 32(%rdi), %ymm2 @@ -3579,20 +3582,20 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3601,211 +3604,212 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 128(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] 
+; AVX-NEXT: vmovaps 128(%rdx), %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3] +; AVX-NEXT: vmovaps 160(%rdx), %xmm14 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX-NEXT: vbroadcastsd 232(%rsi), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] -; AVX-NEXT: vmovaps 224(%rdx), %xmm11 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX-NEXT: vbroadcastsd 232(%rsi), %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm0[0,1],ymm7[2,3] +; AVX-NEXT: vmovaps 224(%rdx), %xmm7 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm15 +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm11[0,1,2],ymm15[3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm5[0],mem[0] +; AVX-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX-NEXT: vmovaps (%r8), %ymm15 -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] -; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX-NEXT: vmovaps (%r8), %ymm13 +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = ymm13[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd 48(%rdx), %xmm8 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3] -; AVX-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1] -; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3] -; AVX-NEXT: vmovapd 32(%r8), %ymm15 -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm15[0],mem[1,2,3] -; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] -; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 48(%rdx), %xmm11 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm11[0],ymm2[0],ymm11[2],ymm2[3] +; AVX-NEXT: vmovapd 48(%rsi), %xmm13 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm11[1] +; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3] +; AVX-NEXT: vmovapd 32(%r8), %ymm13 +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = ymm13[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0],ymm13[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm13[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX-NEXT: vmovaps 64(%r8), %ymm8 -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded 
Reload -; AVX-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovaps 64(%r8), %ymm3 +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3] -; AVX-NEXT: vmovapd 112(%rdx), %xmm5 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3] -; AVX-NEXT: vmovapd 112(%rsi), %xmm8 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] -; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] -; AVX-NEXT: vmovapd 96(%r8), %ymm8 -; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3] -; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX-NEXT: # ymm13 = ymm8[0],mem[1,2,3] -; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm4[2,3] +; AVX-NEXT: vmovapd 112(%rdx), %xmm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] +; AVX-NEXT: vmovapd 112(%rsi), %xmm4 +; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm11 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3] +; AVX-NEXT: vmovapd 96(%r8), %ymm4 +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = mem[0],ymm4[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX-NEXT: # ymm7 = ymm4[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovaps 128(%r8), %ymm2 -; AVX-NEXT: vblendps $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm2[2,3],ymm12[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm8[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3] -; AVX-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX-NEXT: vmovapd 176(%rsi), %xmm5 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] -; AVX-NEXT: vmovapd 160(%r8), %ymm5 -; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = ymm5[0],mem[1,2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3] +; AVX-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX-NEXT: vmovapd 176(%rsi), %xmm3 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX-NEXT: vmovapd 160(%r8), %ymm3 +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = ymm3[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX-NEXT: 
vmovaps 192(%r8), %ymm2 -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX-NEXT: vmovapd 240(%rdx), %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX-NEXT: vmovapd 240(%rsi), %xmm2 +; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm3 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX-NEXT: vmovapd 224(%r8), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX-NEXT: vmovapd 224(%r8), %ymm2 +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm2[1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = xmm0[0],mem[0] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm4[0],mem[0] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm3[0],mem[0] -; AVX-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; 
AVX-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm14[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX-NEXT: vmovaps 160(%rdi), %xmm13 +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 96(%rdi), %xmm13 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX-NEXT: vmovaps 160(%rdi), %xmm11 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX-NEXT: vmovaps (%rdi), %xmm10 +; AVX-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX-NEXT: vmovaps (%rdi), %xmm8 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX-NEXT: vmovaps %xmm8, 16(%r9) -; AVX-NEXT: vmovaps %xmm10, (%r9) -; AVX-NEXT: vmovaps %xmm2, 976(%r9) -; AVX-NEXT: vmovaps %xmm11, 960(%r9) +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX-NEXT: vmovaps %xmm7, 16(%r9) +; AVX-NEXT: vmovaps %xmm8, (%r9) +; AVX-NEXT: vmovaps %xmm1, 976(%r9) +; AVX-NEXT: vmovaps %xmm12, 960(%r9) ; AVX-NEXT: vmovaps %xmm0, 1136(%r9) -; AVX-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX-NEXT: vmovaps %xmm5, 816(%r9) -; AVX-NEXT: vmovaps %xmm13, 800(%r9) -; AVX-NEXT: vmovaps %xmm7, 496(%r9) -; AVX-NEXT: vmovaps %xmm15, 480(%r9) -; AVX-NEXT: vmovaps %xmm1, 176(%r9) -; AVX-NEXT: vmovaps %xmm12, 160(%r9) -; AVX-NEXT: vmovaps %xmm3, 336(%r9) -; AVX-NEXT: vmovaps %xmm14, 320(%r9) -; AVX-NEXT: vmovaps %xmm4, 656(%r9) +; AVX-NEXT: vmovaps %xmm10, 1120(%r9) +; AVX-NEXT: vmovaps %xmm2, 816(%r9) +; AVX-NEXT: vmovaps %xmm11, 800(%r9) +; AVX-NEXT: vmovaps %xmm4, 496(%r9) +; AVX-NEXT: vmovaps %xmm13, 480(%r9) +; AVX-NEXT: vmovaps %xmm3, 176(%r9) +; AVX-NEXT: vmovaps %xmm15, 160(%r9) +; AVX-NEXT: vmovaps %xmm14, 336(%r9) +; AVX-NEXT: vmovaps %xmm5, 320(%r9) +; AVX-NEXT: vmovaps %xmm9, 656(%r9) ; AVX-NEXT: vmovaps %xmm6, 640(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 1216(%r9) @@ -3827,9 +3831,9 @@ define void 
@store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 576(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 512(%r9) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 384(%r9) ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 384(%r9) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 352(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%r9) @@ -3878,76 +3882,77 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-LABEL: store_i64_stride5_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-NEXT: vmovaps (%rdi), %ymm13 ; AVX2-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 -; AVX2-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-NEXT: vmovaps (%rdx), %xmm10 -; AVX2-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm7 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm5 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 ; 
AVX2-NEXT: vbroadcastsd 40(%rsi), %ymm5 -; AVX2-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -3957,33 +3962,32 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-NEXT: vbroadcastsd 136(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ -3994,157 +3998,158 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm9 ; AVX2-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 248(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-NEXT: vmovaps 64(%rcx), %ymm15 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 152(%rsi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX2-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-NEXT: vbroadcastsd 152(%rsi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 192(%rdx), %ymm3 ; AVX2-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 216(%rsi), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-NEXT: vbroadcastsd 216(%rsi), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vmovaps (%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovaps (%r8), %ymm9 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm10[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] -; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 
-; AVX2-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 64(%r8), %ymm5 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, (%rsp), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 96(%r8), %ymm5 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovaps 160(%r8), %ymm2 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm2[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 192(%r8), %ymm4 @@ -4152,19 +4157,20 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] -; AVX2-NEXT: vbroadcastsd 240(%rcx), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm12[2,3] +; AVX2-NEXT: vbroadcastsd 240(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = 
mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-NEXT: vmovaps %ymm2, 1152(%r9) @@ -4175,14 +4181,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm5, 992(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-NEXT: vmovaps %ymm8, 896(%r9) -; AVX2-NEXT: vmovaps %ymm6, 864(%r9) -; AVX2-NEXT: vmovaps %ymm7, 832(%r9) +; AVX2-NEXT: vmovaps %ymm6, 896(%r9) +; AVX2-NEXT: vmovaps %ymm7, 864(%r9) +; AVX2-NEXT: vmovaps %ymm8, 832(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-NEXT: vmovaps %ymm10, 736(%r9) -; AVX2-NEXT: vmovaps %ymm12, 704(%r9) -; AVX2-NEXT: vmovaps %ymm13, 672(%r9) +; AVX2-NEXT: vmovaps %ymm9, 736(%r9) +; AVX2-NEXT: vmovaps %ymm13, 704(%r9) +; AVX2-NEXT: vmovaps %ymm14, 672(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 640(%r9) ; AVX2-NEXT: vmovaps %ymm15, 576(%r9) @@ -4194,7 +4200,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 416(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 384(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 352(%r9) @@ -4202,7 +4208,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%r9) @@ -4216,14 +4222,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: vmovaps %ymm14, 1088(%r9) +; AVX2-NEXT: vmovaps %ymm11, 1088(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-NEXT: vmovaps %ymm11, 1248(%r9) +; AVX2-NEXT: vmovaps %ymm10, 1248(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4237,76 +4243,77 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-LABEL: store_i64_stride5_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $1128, %rsp # imm = 0x468 -; 
AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm10 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm7 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm5 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 ; AVX2-FP-NEXT: vbroadcastsd 40(%rsi), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd 
{{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-FP-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -4316,33 +4323,32 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-FP-NEXT: vbroadcastsd 136(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ -4353,157 +4359,158 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
-; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm9 ; AVX2-FP-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 248(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm0 
-; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm15 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 152(%rsi), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm2 +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FP-NEXT: vbroadcastsd 152(%rsi), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm3 ; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 216(%rsi), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-FP-NEXT: vbroadcastsd 216(%rsi), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%r8), %ymm9 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = 
mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] -; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = ymm0[2,3],mem[2,3] -; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm5 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, (%rsp), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 96(%r8), %ymm5 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # 
ymm8 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm2 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm2[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm4 @@ -4511,19 +4518,20 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] -; AVX2-FP-NEXT: vbroadcastsd 240(%rcx), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm12[2,3] +; AVX2-FP-NEXT: vbroadcastsd 240(%rcx), %ymm12 +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FP-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-FP-NEXT: vmovaps %ymm2, 1152(%r9) @@ -4534,14 +4542,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm5, 992(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-FP-NEXT: vmovaps %ymm8, 896(%r9) -; AVX2-FP-NEXT: vmovaps %ymm6, 864(%r9) -; AVX2-FP-NEXT: vmovaps %ymm7, 832(%r9) +; AVX2-FP-NEXT: vmovaps %ymm6, 896(%r9) +; AVX2-FP-NEXT: vmovaps %ymm7, 864(%r9) +; AVX2-FP-NEXT: vmovaps %ymm8, 832(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-FP-NEXT: vmovaps %ymm10, 736(%r9) -; AVX2-FP-NEXT: vmovaps %ymm12, 704(%r9) -; AVX2-FP-NEXT: vmovaps %ymm13, 672(%r9) +; AVX2-FP-NEXT: vmovaps %ymm9, 736(%r9) +; AVX2-FP-NEXT: vmovaps %ymm13, 704(%r9) +; AVX2-FP-NEXT: vmovaps %ymm14, 672(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 640(%r9) ; AVX2-FP-NEXT: vmovaps %ymm15, 576(%r9) @@ -4553,7 +4561,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%r9) @@ -4561,7 +4569,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%r9) @@ -4575,14 +4583,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: vmovaps %ymm14, 1088(%r9) +; AVX2-FP-NEXT: vmovaps %ymm11, 1088(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FP-NEXT: vmovaps %ymm11, 1248(%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, 1248(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4596,76 +4604,77 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-LABEL: store_i64_stride5_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm10 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm5 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: 
vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vbroadcastsd 40(%rsi), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -4675,33 +4684,32 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-FCP-NEXT: vbroadcastsd 136(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ 
-4712,157 +4720,158 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm9 ; AVX2-FCP-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: 
vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 248(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 152(%rsi), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm2 +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 152(%rsi), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 216(%rsi), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 216(%rsi), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm9[0],ymm0[0],ymm9[2],ymm0[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm9 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = ymm0[2,3],mem[2,3] -; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm5 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, (%rsp), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm5 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm2 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} 
ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm2[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm4 @@ -4870,19 +4879,20 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 240(%rcx), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm12[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 240(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm2, 1152(%r9) @@ -4893,14 +4903,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm5, 992(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm8, 896(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm6, 864(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm7, 832(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm6, 896(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm7, 864(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm8, 832(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm10, 736(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm12, 704(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm13, 672(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm9, 736(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm13, 704(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm14, 672(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm15, 576(%r9) @@ -4912,7 +4922,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), 
%ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%r9) @@ -4920,7 +4930,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%r9) @@ -4934,14 +4944,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: vmovaps %ymm14, 1088(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm11, 1088(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm11, 1248(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm10, 1248(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4966,11 +4976,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4981,25 +4991,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm12 = [6,14,u,u,u,7,15,u] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5018,7 +5028,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5028,7 +5038,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5089,15 +5099,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5168,11 +5178,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm19 = [0,3,11,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5183,25 +5193,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5220,7 +5230,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5230,7 +5240,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5291,15 +5301,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 
%zmm1, %zmm23 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5370,11 +5380,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5385,25 +5395,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 ; 
AVX512DQ-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5422,7 +5432,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5432,7 +5442,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5493,15 +5503,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5572,11 +5582,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5587,25 +5597,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: 
vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5624,7 +5634,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5634,7 +5644,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5695,15 +5705,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5774,11 +5784,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5789,25 +5799,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5826,7 +5836,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; 
AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5836,7 +5846,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5897,15 +5907,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5976,11 +5986,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5991,25 +6001,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6028,7 +6038,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6038,7 +6048,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6099,15 +6109,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6178,11 +6188,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6193,25 +6203,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6230,7 +6240,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6240,7 +6250,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6301,15 +6311,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6380,11 +6390,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6395,25 +6405,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,8,0,0,0,1,9,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [1,0,0,0,10,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6432,7 +6442,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,5,13,0,0,0,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6442,7 +6452,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -6503,15 +6513,15 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7393,31 +7403,30 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i64_stride5_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX-NEXT: subq $2296, %rsp # imm = 0x8F8 +; AVX-NEXT: vmovaps 192(%rdi), %ymm15 ; AVX-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX-NEXT: vmovaps (%rcx), %ymm2 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX-NEXT: vmovaps 16(%rdx), %xmm14 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] ; AVX-NEXT: vmovaps 80(%rdx), %xmm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vmovaps 144(%rdx), %xmm2 @@ -7432,9 +7441,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX-NEXT: vmovaps %ymm14, %ymm2 -; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],mem[0],ymm15[2],mem[2] +; AVX-NEXT: vmovaps %ymm15, %ymm3 +; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -7454,8 +7463,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 288(%rdi), %ymm14 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX-NEXT: vmovapd 288(%rdi), %ymm15 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],mem[0],ymm15[2],mem[2] ; AVX-NEXT: vmovapd 288(%rcx), %xmm1 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -7533,7 +7542,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7558,17 +7567,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7596,10 +7605,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX-NEXT: vbroadcastsd 296(%rsi), %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3] ; AVX-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7622,47 +7631,47 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX-NEXT: vbroadcastsd 392(%rsi), %ymm10 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 
= ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovaps 384(%rdx), %xmm15 -; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX-NEXT: vmovaps 384(%rdx), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX-NEXT: vmovapd 416(%rdi), %ymm11 ; AVX-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] ; AVX-NEXT: vmovaps 416(%rdx), %xmm9 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm10[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX-NEXT: vbroadcastsd 456(%rsi), %ymm9 -; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX-NEXT: vmovaps 448(%rdx), %xmm10 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX-NEXT: vbroadcastsd 488(%rsi), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3] +; AVX-NEXT: vbroadcastsd 456(%rsi), %ymm10 +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX-NEXT: vmovaps 448(%rdx), %xmm9 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,3,2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX-NEXT: vbroadcastsd 488(%rsi), %ymm12 +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] ; AVX-NEXT: vmovapd %ymm13, %ymm9 -; AVX-NEXT: vmovaps 480(%rdx), %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] -; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = 
xmm14[0],mem[0] +; AVX-NEXT: vmovaps 480(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm12[0,1,2],ymm14[3] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vmovaps (%r8), %ymm12 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload ; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7] @@ -7673,9 +7682,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload ; AVX-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7] ; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] ; AVX-NEXT: vmovapd 48(%rdx), %xmm8 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] ; AVX-NEXT: vmovapd 48(%rsi), %xmm12 @@ -7708,14 +7717,14 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] +; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm0[2,3] ; AVX-NEXT: vmovapd 112(%rdx), %xmm7 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] ; AVX-NEXT: vmovapd 112(%rsi), %xmm8 +; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm12 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] ; AVX-NEXT: vmovapd 96(%r8), %ymm8 ; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload ; AVX-NEXT: # ymm12 = mem[0],ymm8[1],mem[2,3] @@ -7725,10 +7734,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] +; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: vmovaps 128(%r8), %ymm6 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload @@ -7740,16 +7749,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovupd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX-NEXT: vmovapd 176(%rdx), %xmm6 -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] +; AVX-NEXT: vmovapd 176(%rdx), %xmm5 +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[3] ; AVX-NEXT: vmovapd 176(%rsi), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] ; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] ; AVX-NEXT: vmovapd 160(%r8), %ymm7 ; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload ; AVX-NEXT: # ymm8 = mem[0],ymm7[1],mem[2,3] @@ -7757,9 +7766,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload ; AVX-NEXT: # ymm8 = ymm7[0],mem[1,2,3] ; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3] +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3] ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] @@ -7781,104 +7790,104 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 240(%rdx), %xmm5 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3] ; AVX-NEXT: vmovapd 240(%rsi), %xmm6 +; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm7 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] ; AVX-NEXT: vmovapd 224(%r8), %ymm6 ; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX-NEXT: # ymm7 = ymm6[0],mem[1,2,3] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0],ymm6[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; 
AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vmovaps 256(%r8), %ymm4 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm15[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3] -; AVX-NEXT: vmovapd 304(%rdx), %xmm4 -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] +; AVX-NEXT: vmovapd 304(%rdx), %xmm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm3[0],ymm5[0],ymm3[2],ymm5[3] ; AVX-NEXT: vmovapd 304(%rsi), %xmm5 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] ; AVX-NEXT: vbroadcastsd 312(%rcx), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] ; AVX-NEXT: vmovapd 288(%r8), %ymm5 ; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload ; AVX-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX-NEXT: # ymm6 = ymm5[0],mem[1,2,3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3] +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm5[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3] ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 320(%r8), %ymm4 -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, 
%ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] ; AVX-NEXT: vmovapd 368(%rdx), %xmm3 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] ; AVX-NEXT: vmovapd 368(%rsi), %xmm4 +; AVX-NEXT: vbroadcastsd 376(%rcx), %ymm5 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX-NEXT: vbroadcastsd 376(%rcx), %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] ; AVX-NEXT: vmovapd 352(%r8), %ymm4 -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = ymm4[0],mem[1,2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm4[0],mem[1,2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0],ymm4[1],mem[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX-NEXT: # ymm2 = 
ymm2[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vmovaps 384(%r8), %ymm3 -; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX-NEXT: vmovapd 432(%rdx), %xmm2 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] ; AVX-NEXT: vmovapd 432(%rsi), %xmm3 @@ -7906,7 +7915,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7914,25 +7923,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 496(%rdx), %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX-NEXT: vmovapd 496(%rsi), %xmm2 +; AVX-NEXT: vbroadcastsd 504(%rcx), %ymm3 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX-NEXT: vbroadcastsd 504(%rcx), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] ; AVX-NEXT: vmovapd 480(%r8), %ymm2 ; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm14[1,2,3] ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm0 
= ymm1[0,1,2],ymm2[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7940,22 +7944,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = xmm15[0],mem[0] -; AVX-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7965,18 +7960,29 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7984,21 +7990,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],mem[0] +; AVX-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm1[0],mem[0] ; AVX-NEXT: vmovaps 352(%rdi), %xmm15 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 320(%rdi), %xmm14 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps 480(%rdi), %xmm12 @@ -8015,39 +8025,39 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX-NEXT: vmovaps %xmm8, 16(%r9) ; AVX-NEXT: vmovaps %xmm9, (%r9) -; AVX-NEXT: vmovaps %xmm7, 1936(%r9) +; AVX-NEXT: vmovaps %xmm2, 1936(%r9) ; AVX-NEXT: vmovaps %xmm10, 1920(%r9) ; AVX-NEXT: vmovaps %xmm0, 2256(%r9) ; AVX-NEXT: vmovaps %xmm11, 2240(%r9) -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vmovaps %xmm0, 2416(%r9) +; AVX-NEXT: vmovaps %xmm6, 2416(%r9) ; AVX-NEXT: vmovaps %xmm12, 2400(%r9) ; AVX-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX-NEXT: vmovaps %xmm14, 2080(%r9) +; AVX-NEXT: vmovaps %xmm13, 2080(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX-NEXT: vmovaps %xmm13, 1600(%r9) -; AVX-NEXT: vmovaps %xmm2, 1776(%r9) +; AVX-NEXT: vmovaps %xmm14, 1600(%r9) +; AVX-NEXT: vmovaps %xmm3, 1776(%r9) ; AVX-NEXT: vmovaps %xmm15, 1760(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 1456(%r9) -; 
AVX-NEXT: vmovaps %xmm3, 1440(%r9) +; AVX-NEXT: vmovaps %xmm4, 1440(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 976(%r9) -; AVX-NEXT: vmovaps %xmm4, 960(%r9) -; AVX-NEXT: vmovaps %xmm5, 1136(%r9) -; AVX-NEXT: vmovaps %xmm6, 1120(%r9) +; AVX-NEXT: vmovaps %xmm5, 960(%r9) +; AVX-NEXT: vmovaps %xmm7, 1136(%r9) +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm0, 1120(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 816(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 800(%r9) -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 496(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 480(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 176(%r9) -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 160(%r9) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 336(%r9) @@ -8189,7 +8199,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 128(%r9) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 96(%r9) -; AVX-NEXT: addq $2264, %rsp # imm = 0x8D8 +; AVX-NEXT: addq $2296, %rsp # imm = 0x8F8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -8207,22 +8217,22 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 ; AVX2-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX2-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm5 -; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 ; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm5 @@ -8232,25 +8242,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 40(%rsi), %ymm5 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-NEXT: vbroadcastsd 72(%rsi), %ymm2 +; AVX2-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm1 @@ -8258,15 +8268,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX2-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -8277,32 +8287,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 136(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ -8313,32 +8323,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 @@ -8349,32 +8359,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 264(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 288(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 296(%rsi), %ymm1 -; 
AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 320(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 @@ -8385,32 +8395,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 328(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 352(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 352(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 360(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 384(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0 @@ -8421,32 +8431,32 @@ 
define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 392(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 416(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0 @@ -8457,164 +8467,164 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 456(%rsi), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rsi), %xmm0 ; AVX2-NEXT: 
vinsertf128 $1, 480(%rcx), %ymm0, %ymm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-NEXT: vmovaps 480(%rdx), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 488(%rsi), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 184(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 248(%rsi), %ymm0 +; AVX2-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 312(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 
312(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 376(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX2-NEXT: vbroadcastsd 376(%rcx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 376(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 440(%rsi), %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 440(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 504(%rsi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 504(%rsi), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 504(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 504(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-NEXT: vmovaps 64(%rdx), %ymm3 ; AVX2-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-NEXT: vmovaps 128(%rcx), %ymm15 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-NEXT: vbroadcastsd 88(%rsi), %ymm5 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 152(%rsi), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX2-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-NEXT: vmovaps 128(%rcx), %ymm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-NEXT: vbroadcastsd 152(%rsi), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 192(%rdx), %ymm15 +; AVX2-NEXT: vmovaps 192(%rcx), %ymm5 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-NEXT: vbroadcastsd 216(%rsi), %ymm7 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdx), %ymm7 ; AVX2-NEXT: vmovaps 256(%rcx), %ymm6 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX2-NEXT: vbroadcastsd 280(%rsi), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 320(%rdx), %ymm10 -; AVX2-NEXT: vmovaps 320(%rcx), %ymm8 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-NEXT: vmovaps 320(%rcx), %ymm9 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-NEXT: vbroadcastsd 344(%rsi), %ymm13 ; AVX2-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 344(%rsi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm11[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 384(%rcx), %ymm11 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-NEXT: vbroadcastsd 408(%rsi), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastsd 408(%rsi), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rdx), %ymm12 -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 448(%rdx), %ymm13 +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-NEXT: vbroadcastsd 472(%rsi), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vmovaps (%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm8[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 64(%r8), %ymm1 @@ -8640,12 +8650,12 @@ define void @store_i64_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 128(%r8), %ymm1 @@ -8658,12 +8668,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovaps 160(%r8), %ymm1 ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -8676,7 +8686,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 192(%r8), %ymm1 @@ -8702,11 +8712,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] @@ -8720,12 +8730,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 304(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 304(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovaps 288(%r8), %ymm1 ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -8738,7 +8748,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 320(%r8), %ymm1 @@ -8761,64 +8771,64 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-NEXT: # ymm15 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vmovaps 384(%r8), %ymm8 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = ymm8[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 432(%rcx), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vmovaps 416(%r8), %ymm12 -; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm1[2,3] +; AVX2-NEXT: vbroadcastsd 432(%rcx), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovaps 416(%r8), %ymm9 +; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-NEXT: # ymm7 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = ymm11[2,3],mem[2,3] +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm8[2,3],mem[2,3] ; AVX2-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-NEXT: # ymm4 = 
mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm0[4,5],ymm8[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 496(%rcx), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm0[2,3] +; AVX2-NEXT: vbroadcastsd 496(%rcx), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX2-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-NEXT: vmovaps %ymm2, 2432(%r9) @@ -8831,16 +8841,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 2240(%r9) ; AVX2-NEXT: vmovaps %ymm6, 2176(%r9) ; AVX2-NEXT: vmovaps %ymm7, 2144(%r9) -; AVX2-NEXT: vmovaps %ymm8, 2112(%r9) +; AVX2-NEXT: vmovaps %ymm10, 2112(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2080(%r9) -; AVX2-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-NEXT: vmovaps %ymm10, 1984(%r9) -; AVX2-NEXT: vmovaps %ymm13, 1952(%r9) +; AVX2-NEXT: vmovaps %ymm12, 2016(%r9) +; AVX2-NEXT: vmovaps %ymm13, 1984(%r9) +; AVX2-NEXT: vmovaps %ymm14, 1952(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1920(%r9) -; AVX2-NEXT: vmovaps %ymm14, 1856(%r9) -; AVX2-NEXT: vmovaps %ymm15, 1824(%r9) +; AVX2-NEXT: vmovaps %ymm15, 1856(%r9) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 1824(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8869,7 +8880,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 1312(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1280(%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1184(%r9) @@ -8901,7 +8912,7 @@ define void 
@store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 544(%r9) @@ -8933,8 +8944,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%r9) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2368(%r9) +; AVX2-NEXT: vmovaps %ymm9, 2368(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2048(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8949,7 +8959,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-NEXT: vmovaps %ymm11, 2528(%r9) +; AVX2-NEXT: vmovaps %ymm8, 2528(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8982,22 +8992,22 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-FP-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm5 -; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-FP-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm5 @@ -9007,25 +9017,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: 
vbroadcastsd 40(%rsi), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-FP-NEXT: vbroadcastsd 72(%rsi), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm1 @@ -9033,15 +9043,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -9052,32 +9062,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 136(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ -9088,32 +9098,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 @@ -9124,32 +9134,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 264(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: 
vmovaps 288(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 288(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 296(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 320(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 @@ -9160,32 +9170,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 328(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 352(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 352(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 360(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 384(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0 @@ -9196,32 +9206,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 392(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 416(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0 @@ -9232,164 +9242,164 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 456(%rsi), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 480(%rdx), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 488(%rsi), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 184(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 248(%rsi), %ymm0 +; AVX2-FP-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 312(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 376(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 376(%rcx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 376(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 440(%rsi), %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 440(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 504(%rsi), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 504(%rsi), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 504(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 504(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-FP-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FP-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm3 ; AVX2-FP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm4 
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm15 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FP-NEXT: vbroadcastsd 88(%rsi), %ymm5 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 152(%rsi), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps 128(%rcx), %ymm12 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FP-NEXT: vbroadcastsd 152(%rsi), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm15 +; AVX2-FP-NEXT: vmovaps 192(%rcx), %ymm5 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-FP-NEXT: vbroadcastsd 216(%rsi), %ymm7 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%rdx), %ymm7 ; AVX2-FP-NEXT: vmovaps 256(%rcx), %ymm6 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX2-FP-NEXT: vbroadcastsd 280(%rsi), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 320(%rdx), %ymm10 -; AVX2-FP-NEXT: vmovaps 320(%rcx), %ymm8 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-FP-NEXT: vmovaps 320(%rcx), %ymm9 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-FP-NEXT: vbroadcastsd 344(%rsi), %ymm13 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 344(%rsi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 384(%rcx), %ymm11 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-FP-NEXT: vbroadcastsd 408(%rsi), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 408(%rsi), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm12 -; AVX2-FP-NEXT: 
vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm13 +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-FP-NEXT: vbroadcastsd 472(%rsi), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm8[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = 
mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 64(%r8), %ymm1 @@ -9415,12 +9425,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm1 @@ -9433,12 +9443,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -9451,7 +9461,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm1 @@ -9477,11 +9487,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] @@ -9495,12 +9505,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 304(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 304(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 288(%r8), %ymm1 ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -9513,7 +9523,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-FP-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 320(%r8), %ymm1 @@ -9536,64 +9546,64 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm8 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = ymm8[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 432(%rcx), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 416(%r8), %ymm12 -; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vbroadcastsd 432(%rcx), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 416(%r8), %ymm9 +; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm7 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = ymm11[2,3],mem[2,3] +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm8[2,3],mem[2,3] ; AVX2-FP-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 496(%rcx), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm0[2,3] +; AVX2-FP-NEXT: vbroadcastsd 496(%rcx), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-FP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-FP-NEXT: vblendps 
$63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-FP-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-FP-NEXT: vmovaps %ymm2, 2432(%r9) @@ -9606,16 +9616,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 2240(%r9) ; AVX2-FP-NEXT: vmovaps %ymm6, 2176(%r9) ; AVX2-FP-NEXT: vmovaps %ymm7, 2144(%r9) -; AVX2-FP-NEXT: vmovaps %ymm8, 2112(%r9) +; AVX2-FP-NEXT: vmovaps %ymm10, 2112(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2080(%r9) -; AVX2-FP-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-FP-NEXT: vmovaps %ymm10, 1984(%r9) -; AVX2-FP-NEXT: vmovaps %ymm13, 1952(%r9) +; AVX2-FP-NEXT: vmovaps %ymm12, 2016(%r9) +; AVX2-FP-NEXT: vmovaps %ymm13, 1984(%r9) +; AVX2-FP-NEXT: vmovaps %ymm14, 1952(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1920(%r9) -; AVX2-FP-NEXT: vmovaps %ymm14, 1856(%r9) -; AVX2-FP-NEXT: vmovaps %ymm15, 1824(%r9) +; AVX2-FP-NEXT: vmovaps %ymm15, 1856(%r9) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 1824(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9644,7 +9655,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 1312(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%r9) @@ -9676,7 +9687,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%r9) @@ -9708,8 +9719,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2368(%r9) +; AVX2-FP-NEXT: vmovaps %ymm9, 2368(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2048(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9724,7 +9734,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FP-NEXT: vmovaps %ymm11, 2528(%r9) +; AVX2-FP-NEXT: vmovaps 
%ymm8, 2528(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9757,22 +9767,22 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-FCP-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm5 @@ -9782,25 +9792,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 40(%rsi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 72(%rsi), %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; 
AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm1 @@ -9808,15 +9818,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 104(%rsi), %ymm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rcx), %ymm0, %ymm0 @@ -9827,32 +9837,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 136(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rsi), 
%xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 168(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 @@ -9863,32 +9873,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 200(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 232(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 @@ -9899,32 +9909,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 264(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 288(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 296(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 320(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 @@ -9935,32 +9945,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 328(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 352(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 352(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 360(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0 @@ -9971,32 +9981,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 392(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 416(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0 @@ -10007,164 +10017,164 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 456(%rsi), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 480(%rdx), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 488(%rsi), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 56(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 120(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 184(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 248(%rsi), %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 312(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 376(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 376(%rcx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 376(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 440(%rsi), %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 440(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 504(%rsi), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 504(%rsi), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 504(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 504(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm15 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 88(%rsi), %ymm5 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 152(%rsi), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm4 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovaps 128(%rcx), 
%ymm12 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FCP-NEXT: vbroadcastsd 152(%rsi), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm15 +; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm5 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-FCP-NEXT: vbroadcastsd 216(%rsi), %ymm7 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%rdx), %ymm7 ; AVX2-FCP-NEXT: vmovaps 256(%rcx), %ymm6 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX2-FCP-NEXT: vbroadcastsd 280(%rsi), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 320(%rdx), %ymm10 -; AVX2-FCP-NEXT: vmovaps 320(%rcx), %ymm8 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-FCP-NEXT: vmovaps 320(%rcx), %ymm9 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-FCP-NEXT: vbroadcastsd 344(%rsi), %ymm13 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 344(%rsi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 384(%rcx), %ymm11 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-FCP-NEXT: vbroadcastsd 408(%rsi), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 408(%rsi), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm12 -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm13 +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-FCP-NEXT: vbroadcastsd 472(%rsi), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm8[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm1 @@ -10190,12 +10200,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm1 @@ -10208,12 +10218,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -10226,7 +10236,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] ; AVX2-FCP-NEXT: vperm2f128 
$49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm1 @@ -10252,11 +10262,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] @@ -10270,12 +10280,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 304(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 304(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 288(%r8), %ymm1 ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] @@ -10288,7 +10298,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 320(%r8), %ymm1 @@ -10311,64 +10321,64 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm15 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; 
AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm8 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = ymm8[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 432(%rcx), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 416(%r8), %ymm12 -; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 432(%rcx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 416(%r8), %ymm9 +; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = ymm11[2,3],mem[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm8[2,3],mem[2,3] ; AVX2-FCP-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 496(%rcx), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm0[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 496(%rcx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-FCP-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm2, 2432(%r9) @@ -10381,16 +10391,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 2240(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm6, 2176(%r9) ; AVX2-FCP-NEXT: vmovaps %ymm7, 2144(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm8, 2112(%r9) +; 
AVX2-FCP-NEXT: vmovaps %ymm10, 2112(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2080(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm10, 1984(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm13, 1952(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm12, 2016(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm13, 1984(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm14, 1952(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1920(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm14, 1856(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm15, 1824(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm15, 1856(%r9) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 1824(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10419,7 +10430,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%r9) @@ -10451,7 +10462,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%r9) @@ -10483,8 +10494,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2368(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm9, 2368(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2048(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10499,7 +10509,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FCP-NEXT: vmovaps %ymm11, 2528(%r9) +; AVX2-FCP-NEXT: vmovaps %ymm8, 2528(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10532,7 +10542,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm20 = [3,u,u,u,12,4,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -10548,20 +10558,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -10651,20 +10661,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -10801,16 +10811,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10995,7 +11005,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11011,20 +11021,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -11114,20 +11124,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -11264,16 +11274,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11458,7 +11468,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11474,20 +11484,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; 
AVX512DQ-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -11577,20 +11587,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -11727,16 +11737,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11921,7 +11931,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -11937,20 +11947,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -12040,20 +12050,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -12190,16 +12200,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12384,7 +12394,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -12400,20 +12410,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: 
vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -12503,20 +12513,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -12653,16 +12663,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12847,7 +12857,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ 
-12863,20 +12873,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -12966,20 +12976,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13116,16 +13126,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13310,7 +13320,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -13326,20 +13336,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,8,0,0,0,1,9,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -13429,20 +13439,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; 
AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -13579,16 +13589,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13773,7 +13783,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [3,0,0,0,12,4,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -13789,20 +13799,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = 
[0,8,0,0,0,1,9,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,0,2,10,0,0,0,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,5,13,0,0,0,6,14] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5 @@ -13892,20 +13902,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,3,11,0,0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,0,8,0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [1,0,0,0,10,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [6,14,0,0,0,7,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -14042,16 +14052,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index aac6a1bddd08a..18a762714504a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -71,9 +71,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-NEXT: vmovaps (%r8), %xmm2 ; AVX2-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%r8), %xmm2 ; AVX2-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,1,3] @@ -92,9 +92,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-FP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,1,3] @@ -113,9 +113,9 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm2 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,1,3] @@ -134,14 +134,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) @@ -153,14 +153,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -172,14 +172,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) @@ -191,14 +191,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -210,14 +210,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa 
(%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) @@ -229,14 +229,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -248,14 +248,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) @@ -267,14 +267,14 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -374,10 +374,10 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm1[2,3] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] ; AVX-NEXT: vmovapd 16(%rdx), %xmm1 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX-NEXT: vbroadcastsd 24(%r8), %ymm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] @@ -397,43 +397,43 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-NEXT: vmovaps (%r8), %ymm4 -; AVX2-NEXT: vmovaps (%r9), %xmm5 -; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] +; AVX2-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-NEXT: vmovaps (%r8), %ymm2 +; AVX2-NEXT: vmovaps (%r9), %xmm6 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = xmm6[0,0] ; AVX2-NEXT: vmovaps (%rsi), %xmm7 ; AVX2-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[0,1],ymm9[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[0,1],ymm9[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-NEXT: vmovaps (%rdx), %xmm10 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] ; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm8[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm1 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: 
vmovaps %ymm8, 96(%rax) +; AVX2-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-NEXT: vmovaps %ymm7, (%rax) -; AVX2-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -442,43 +442,43 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FP-NEXT: vmovaps (%r8), %ymm4 -; AVX2-FP-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FP-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-FP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FP-NEXT: vmovaps (%r9), %xmm6 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = xmm6[0,0] ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm7 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[0,1],ymm9[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[0,1],ymm9[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FP-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm10 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] ; AVX2-FP-NEXT: vbroadcastsd 8(%r8), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm8[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd 16(%r9), %ymm1 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FP-NEXT: vmovaps %ymm7, (%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 
64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -487,43 +487,43 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm4 -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] +; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm6 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = xmm6[0,0] ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm7 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[0,1],ymm9[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[0,1],ymm9[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm10 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] ; AVX2-FCP-NEXT: vbroadcastsd 8(%r8), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm8[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 16(%r9), %ymm1 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 
64(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -536,18 +536,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -564,18 +564,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -592,18 +592,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -620,18 +620,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -648,18 +648,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: 
vmovdqa64 %zmm5, 64(%rax) @@ -676,18 +676,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -704,18 +704,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -732,18 +732,18 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,4,8,12,0,0,1,5] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [0,1,2,3,8,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,5,0,0,10,14,2,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [11,15,3,7,11,15,3,7] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,14,2,3,4,5,11,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) @@ -886,7 +886,6 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovapd 32(%r8), %ymm14 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX-NEXT: vmovaps (%rsi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX-NEXT: vmovaps (%rdi), %xmm3 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1] @@ -908,7 +907,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vinsertf128 $1, (%r9), %ymm7, %ymm10 ; AVX-NEXT: vbroadcastsd 8(%r8), %ymm12 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm4[1] @@ -918,54 +918,54 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovapd 32(%rsi), %ymm15 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX-NEXT: vmovapd 32(%r9), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm15[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[3] -; AVX-NEXT: vmovapd (%rdi), %ymm14 -; AVX-NEXT: vmovapd (%rsi), %ymm15 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX-NEXT: vmovapd (%r9), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm15[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[0],ymm14[0],ymm11[2],ymm14[3] -; AVX-NEXT: vmovapd 48(%rdx), %xmm11 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] -; AVX-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3] -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vmovapd 16(%rdx), %xmm15 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] -; AVX-NEXT: vbroadcastsd 24(%r8), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] -; AVX-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX-NEXT: vmovapd 32(%r9), %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3],ymm15[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[2],ymm15[3] +; AVX-NEXT: vmovapd (%rdi), %ymm15 +; AVX-NEXT: vmovapd (%rsi), %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm15[2,3] +; AVX-NEXT: vmovapd (%r9), %ymm15 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm0[0],ymm11[2],ymm0[3] +; AVX-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX-NEXT: vbroadcastsd 56(%r8), %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 16(%rdx), %xmm2 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] +; AVX-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm4[0] ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm9[0],xmm5[0] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps %xmm5, 16(%rax) -; AVX-NEXT: vmovaps %xmm3, (%rax) +; AVX-NEXT: vmovaps %xmm3, 16(%rax) +; AVX-NEXT: vmovaps %xmm1, (%rax) ; AVX-NEXT: vmovaps %xmm6, 208(%rax) ; AVX-NEXT: vmovaps %xmm4, 192(%rax) -; AVX-NEXT: vmovapd %ymm14, 128(%rax) +; AVX-NEXT: vmovapd %ymm11, 128(%rax) ; AVX-NEXT: vmovapd %ymm13, 320(%rax) -; AVX-NEXT: vmovaps %ymm2, 96(%rax) -; AVX-NEXT: vmovapd %ymm1, 160(%rax) +; AVX-NEXT: vmovaps %ymm7, 96(%rax) +; AVX-NEXT: vmovapd %ymm2, 160(%rax) ; AVX-NEXT: vmovapd %ymm10, 224(%rax) ; AVX-NEXT: vmovaps %ymm0, 288(%rax) -; AVX-NEXT: vmovaps %ymm7, 64(%rax) -; AVX-NEXT: vmovapd %ymm11, 352(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 64(%rax) +; AVX-NEXT: vmovapd %ymm14, 352(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -975,253 +975,253 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i64_stride6_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%r8), %ymm1 -; AVX2-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-NEXT: vmovaps (%r9), %xmm3 -; AVX2-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-NEXT: vmovaps 
(%rdi), %ymm0 +; AVX2-NEXT: vmovaps (%r8), %ymm2 +; AVX2-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-NEXT: vmovaps (%r9), %xmm4 +; AVX2-NEXT: vmovaps 32(%r9), %xmm6 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm4[0,0] ; AVX2-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm9[1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm3[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX2-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] +; AVX2-NEXT: vmovaps 32(%rdx), %xmm11 ; AVX2-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] -; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX2-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm11[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1] +; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = xmm6[0,0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm8[1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[0,1],ymm14[0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm12 +; AVX2-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-NEXT: 
vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm10 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] -; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] -; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX2-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm11[2,3] +; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm13 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm14 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] +; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-NEXT: vmovaps %ymm9, 288(%rax) -; AVX2-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-NEXT: vmovaps %ymm10, 352(%rax) -; AVX2-NEXT: vmovaps %ymm6, (%rax) -; AVX2-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-NEXT: vmovaps %ymm2, 256(%rax) -; AVX2-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: 
vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-NEXT: vmovaps %ymm10, 160(%rax) +; AVX2-NEXT: vmovaps %ymm11, 288(%rax) +; AVX2-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-NEXT: vmovaps %ymm9, 352(%rax) +; AVX2-NEXT: vmovaps %ymm7, (%rax) +; AVX2-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-NEXT: vmovaps %ymm3, 256(%rax) +; AVX2-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i64_stride6_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-FP-NEXT: vmovaps (%r9), %xmm3 -; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FP-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm6 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = xmm4[0,0] ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm9[1] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm3[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX2-FP-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm11 ; AVX2-FP-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] -; AVX2-FP-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm11[1],xmm10[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm14 
= xmm13[1],xmm12[1] +; AVX2-FP-NEXT: vbroadcastsd 8(%r8), %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = xmm6[0,0] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm8[1] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[0,1],ymm14[0,1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm12 +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm10 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX2-FP-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm11[2,3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vbroadcastsd 16(%r9), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm13 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = 
ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FP-NEXT: vbroadcastsd 16(%r9), %ymm14 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 288(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FP-NEXT: vmovaps %ymm10, 352(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, (%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 256(%rax) -; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 160(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 288(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 352(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 256(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i64_stride6_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm3 -; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm6 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = xmm4[0,0] ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm9[1] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm3[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm11 ; AVX2-FCP-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] -; AVX2-FCP-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm11[1],xmm10[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1] +; AVX2-FCP-NEXT: vbroadcastsd 8(%r8), %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = xmm6[0,0] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm8[1] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[0,1],ymm14[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm12 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm10 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm11 +; 
AVX2-FCP-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm11[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 16(%r9), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm13 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%r9), %ymm14 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 288(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 352(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, (%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 256(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm10, 160(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 288(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 352(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 256(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1232,11 +1232,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512-NEXT: 
vmovdqa64 (%r8), %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512-NEXT: movb $12, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 @@ -1244,9 +1244,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $16, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1254,32 +1254,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512-NEXT: vbroadcasti64x4 
{{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1289,16 +1289,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1316,11 +1316,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512-FCP-NEXT: movb $12, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 @@ -1328,9 +1328,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $16, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1338,32 +1338,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: 
vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1373,16 +1373,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1400,11 +1400,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[4,12,5,13,4,12,5,13] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-NEXT: movb $12, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 @@ -1412,9 +1412,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movb $16, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1422,32 +1422,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q 
%zmm4, %zmm6, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1457,16 +1457,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1484,11 +1484,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: movb $12, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 @@ -1496,9 +1496,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $16, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1506,32 +1506,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: kmovw %r9d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q 
%zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1541,16 +1541,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1568,11 +1568,11 @@ define void 
@store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 @@ -1580,9 +1580,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1590,32 +1590,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1625,16 +1625,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1652,11 +1652,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: movb $12, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 @@ -1664,9 +1664,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $16, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1674,32 +1674,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = 
[0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1709,16 +1709,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1736,11 +1736,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: movb $12, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 @@ -1748,9 +1748,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movb $16, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1758,32 +1758,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm6, 
%zmm5, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1793,16 +1793,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -1820,11 +1820,11 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movb $12, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 @@ -1832,9 +1832,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $16, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, 
%zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1842,32 +1842,32 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1877,16 +1877,16 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) @@ -2158,31 +2158,29 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-LABEL: store_i64_stride6_vf16: ; AVX: # %bb.0: ; AVX-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX-NEXT: vmovapd (%r8), %ymm11 +; AVX-NEXT: vmovapd (%r8), %ymm13 ; AVX-NEXT: vmovapd 32(%r8), %ymm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX-NEXT: vmovaps (%rsi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 64(%rsi), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 ; AVX-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2192,55 +2190,59 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] +; AVX-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm3, %ymm4 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 
= ymm2[0],ymm1[1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm5[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 64(%r8), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rcx), %xmm10 -; AVX-NEXT: vmovaps 64(%rdx), %xmm9 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] -; AVX-NEXT: vbroadcastsd 72(%r8), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 40(%r8), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 96(%r8), %ymm1 +; AVX-NEXT: vmovapd 64(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm6[1] -; AVX-NEXT: vbroadcastsd 104(%r8), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm3 +; AVX-NEXT: vbroadcastsd 72(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 96(%rsi), %xmm14 +; AVX-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm14[1] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vmovapd 96(%r8), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX-NEXT: vmovaps 96(%rcx), %xmm8 +; AVX-NEXT: vmovaps 96(%rdx), %xmm7 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] +; AVX-NEXT: vbroadcastsd 104(%r8), %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd (%rdi), %ymm2 -; AVX-NEXT: vmovapd (%rsi), %ymm12 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd (%r9), %ymm4 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm12[2,3] +; AVX-NEXT: vmovapd (%rsi), %ymm11 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX-NEXT: vmovapd (%r9), %ymm6 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],ymm11[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 32(%rdi), %ymm2 @@ -2254,113 +2256,113 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 64(%rdi), %ymm0 ; AVX-NEXT: vmovapd 64(%rsi), %ymm11 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 64(%r9), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 96(%rdi), %ymm5 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 64(%r9), %ymm4 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm11[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 96(%rdi), %ymm1 ; AVX-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] -; AVX-NEXT: vmovapd 96(%r9), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm1[2,3] +; AVX-NEXT: vmovapd 96(%r9), %ymm5 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3],ymm11[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[0],ymm11[0],ymm3[2],ymm11[3] +; AVX-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 
16(%rdx), %xmm3 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] ; AVX-NEXT: vbroadcastsd 24(%r8), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3] -; AVX-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 80(%rcx), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] -; AVX-NEXT: vmovapd 80(%rdx), %xmm14 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3] +; AVX-NEXT: vmovaps 48(%rdi), %xmm12 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm3[0,1,2],ymm6[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 48(%rdx), %xmm3 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX-NEXT: vbroadcastsd 56(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3] +; AVX-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vbroadcastsd 80(%rcx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 88(%r8), %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] ; AVX-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX-NEXT: vmovapd 112(%rdx), %xmm14 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] -; AVX-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm15[0],mem[0] -; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm4[3] +; 
AVX-NEXT: vmovapd 112(%rdx), %xmm4 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX-NEXT: vbroadcastsd 120(%r8), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm15[0],mem[0] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm15[0],mem[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm14[0] ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = xmm6[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX-NEXT: # xmm8 = xmm8[0],mem[0] +; AVX-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps %xmm8, 16(%rax) -; AVX-NEXT: vmovaps %xmm6, (%rax) -; AVX-NEXT: vmovaps %xmm3, 592(%rax) -; AVX-NEXT: vmovaps %xmm7, 576(%rax) -; AVX-NEXT: vmovaps %xmm14, 208(%rax) -; AVX-NEXT: vmovaps %xmm10, 192(%rax) -; AVX-NEXT: vmovaps %xmm9, 400(%rax) -; AVX-NEXT: vmovaps %xmm12, 384(%rax) +; AVX-NEXT: vmovaps %xmm14, 16(%rax) +; AVX-NEXT: vmovaps %xmm8, (%rax) +; AVX-NEXT: vmovaps %xmm7, 592(%rax) +; AVX-NEXT: vmovaps %xmm10, 576(%rax) +; AVX-NEXT: vmovaps %xmm15, 208(%rax) +; AVX-NEXT: vmovaps %xmm13, 192(%rax) +; AVX-NEXT: vmovaps %xmm5, 400(%rax) +; AVX-NEXT: vmovaps %xmm4, 384(%rax) ; AVX-NEXT: vmovapd %ymm11, 704(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm3, 512(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm3, 320(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm3, 128(%rax) -; AVX-NEXT: vmovapd %ymm1, 736(%rax) -; AVX-NEXT: vmovaps %ymm13, 672(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 640(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 608(%rax) -; AVX-NEXT: vmovapd %ymm0, 544(%rax) -; AVX-NEXT: vmovaps %ymm2, 480(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 512(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 320(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm4, 128(%rax) +; AVX-NEXT: vmovapd %ymm0, 736(%rax) +; AVX-NEXT: vmovaps %ymm1, 
672(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 640(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 608(%rax) +; AVX-NEXT: vmovapd %ymm2, 544(%rax) +; AVX-NEXT: vmovaps %ymm3, 480(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 448(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 416(%rax) -; AVX-NEXT: vmovapd %ymm4, 352(%rax) -; AVX-NEXT: vmovaps %ymm5, 288(%rax) +; AVX-NEXT: vmovapd %ymm6, 352(%rax) +; AVX-NEXT: vmovaps %ymm9, 288(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 224(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rax) +; AVX-NEXT: vmovapd %ymm12, 160(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2477,37 +2479,37 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm4 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] -; AVX2-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm5 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3],ymm5[2,3] +; AVX2-NEXT: 
vmovaps 64(%rdi), %ymm6 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] ; AVX2-NEXT: vbroadcastsd 80(%rcx), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-NEXT: vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-NEXT: # ymm6 = mem[2,3],ymm6[2,3] ; AVX2-NEXT: vbroadcastsd 80(%r9), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-NEXT: vbroadcastsd 88(%r8), %ymm10 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] @@ -2519,16 +2521,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 112(%rcx), %ymm13 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm11 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm12 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovaps %ymm11, 736(%rax) @@ -2536,16 +2538,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm8, 672(%rax) ; AVX2-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-NEXT: vmovaps %ymm9, 544(%rax) -; AVX2-NEXT: vmovaps %ymm7, 512(%rax) -; AVX2-NEXT: vmovaps %ymm5, 480(%rax) +; AVX2-NEXT: vmovaps %ymm6, 512(%rax) +; AVX2-NEXT: vmovaps %ymm4, 480(%rax) ; AVX2-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-NEXT: vmovaps %ymm6, 352(%rax) +; AVX2-NEXT: vmovaps %ymm5, 352(%rax) ; AVX2-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) @@ 
-2675,37 +2677,37 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm4 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm5 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3],ymm5[2,3] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] ; AVX2-FP-NEXT: vbroadcastsd 80(%rcx), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-FP-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-FP-NEXT: vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm6 = mem[2,3],ymm6[2,3] ; AVX2-FP-NEXT: vbroadcastsd 80(%r9), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 88(%r8), %ymm10 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] @@ -2717,16 +2719,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 112(%rcx), %ymm13 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm11 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm12 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovaps %ymm11, 736(%rax) @@ -2734,16 +2736,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm8, 672(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FP-NEXT: vmovaps %ymm9, 544(%rax) -; AVX2-FP-NEXT: vmovaps %ymm7, 512(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 480(%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 512(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 480(%rax) ; AVX2-FP-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 352(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 352(%rax) ; AVX2-FP-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-FP-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax) @@ -2873,37 +2875,37 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FCP-NEXT: 
vmovaps 32(%rsi), %ymm5 -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm4 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm5 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3],ymm5[2,3] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm10[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 80(%rcx), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-FCP-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-FCP-NEXT: vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm6 = mem[2,3],ymm6[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 80(%r9), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 88(%r8), %ymm10 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] @@ -2915,16 +2917,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 112(%rcx), %ymm13 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm11 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm11 ; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 120(%r8), %ymm12 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm11, 736(%rax) @@ -2932,16 +2934,16 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm8, 672(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm9, 544(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm7, 512(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 480(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 512(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 480(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 352(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 352(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax) @@ -2970,7 +2972,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -2981,8 +2983,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512-NEXT: movb $12, %r10b @@ -2996,18 +2998,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = 
zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3019,10 +3021,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $48, %r9b ; AVX512-NEXT: kmovw %r9d, %k2 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3032,60 +3034,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vinserti64x4 $0, %ymm14, 
%zmm0, %zmm12 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3108,7 +3110,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3119,8 +3121,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512-FCP-NEXT: movb $12, %r10b @@ -3134,18 +3136,18 @@ 
define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3157,10 +3159,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $48, %r9b ; AVX512-FCP-NEXT: kmovw %r9d, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3170,60 +3172,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512-FCP-NEXT: 
vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3246,7 +3248,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3257,8 +3259,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-NEXT: movb $12, %r10b @@ -3272,18 +3274,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3295,10 +3297,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movb $48, %r9b ; AVX512DQ-NEXT: kmovw %r9d, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3308,60 +3310,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; 
AVX512DQ-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm20 = 
[10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3384,7 +3386,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3395,8 +3397,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-FCP-NEXT: movb $12, %r10b @@ -3410,18 +3412,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3433,10 +3435,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $48, %r9b ; AVX512DQ-FCP-NEXT: 
kmovw %r9d, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3446,60 +3448,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3522,7 +3524,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3533,8 +3535,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512BW-NEXT: movb $12, %r10b @@ -3548,18 +3550,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512BW-NEXT: 
vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3571,10 +3573,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3584,60 +3586,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # 
zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3660,7 +3662,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 ; 
AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3671,8 +3673,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: movb $12, %r10b @@ -3686,18 +3688,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3709,10 +3711,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $48, %r9b ; AVX512BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3722,60 +3724,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm27 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3798,7 +3800,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3809,8 +3811,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-NEXT: movb $12, %r10b @@ -3824,18 +3826,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3847,10 +3849,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512DQ-BW-NEXT: movb $48, %r9b ; AVX512DQ-BW-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3860,60 +3862,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm20, 
%zmm8 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -3936,7 +3938,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm2 @@ -3947,8 +3949,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,12] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,4,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movb $12, %r10b @@ -3962,18 +3964,18 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 @@ -3985,10 +3987,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $48, %r9b ; AVX512DQ-BW-FCP-NEXT: kmovd %r9d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 @@ -3998,60 +4000,60 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm28[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm22, %zmm16 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm12, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm12, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm21[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm14[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm1 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) @@ -4599,8 +4601,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX-NEXT: vmovaps (%rsi), %xmm4 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX-NEXT: vmovaps (%rdi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -4610,241 +4611,242 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rcx), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm3 ; AVX-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] +; AVX-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm4, %ymm5 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX-NEXT: vbroadcastsd 40(%r8), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps 64(%rsi), %xmm13 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm11[1] +; AVX-NEXT: vmovaps 64(%rdi), %xmm2 +; 
AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 64(%r8), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX-NEXT: vmovapd 64(%r8), %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX-NEXT: vbroadcastsd 72(%r8), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm3 +; AVX-NEXT: vbroadcastsd 72(%r8), %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 96(%r8), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX-NEXT: vmovapd 96(%r8), %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX-NEXT: vbroadcastsd 104(%r8), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX-NEXT: vbroadcastsd 104(%r8), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX-NEXT: vmovaps 128(%rdi), 
%xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovapd 128(%r8), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rcx), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX-NEXT: vbroadcastsd 136(%r8), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 128(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX-NEXT: vmovapd 160(%r8), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3] +; AVX-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm14[1] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 +; AVX-NEXT: vmovapd 128(%r8), %ymm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 128(%rcx), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX-NEXT: vbroadcastsd 136(%r8), %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 128(%r9), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 160(%rsi), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX-NEXT: vmovapd 160(%r8), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rcx), %xmm6 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX-NEXT: vbroadcastsd 168(%r8), %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 160(%r9), %ymm4, %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 160(%rcx), %xmm7 +; AVX-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps 160(%rdx), %xmm6 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vbroadcastsd 168(%r8), %ymm5 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 160(%r9), %ymm6, %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rsi), %xmm6 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX-NEXT: vmovapd 192(%r8), %ymm4 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3] +; AVX-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX-NEXT: vmovapd 192(%r8), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%rdx), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX-NEXT: vbroadcastsd 200(%r8), %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 192(%r9), %ymm6, %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 192(%rdx), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX-NEXT: vinsertf128 $1, 192(%r9), %ymm5, %ymm7 +; AVX-NEXT: vbroadcastsd 200(%r8), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rsi), %xmm7 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 224(%rdi), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX-NEXT: vmovapd 224(%r8), %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] +; AVX-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX-NEXT: vmovapd 224(%r8), %ymm12 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3] ; AVX-NEXT: vmovddup {{.*#+}} 
xmm7 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rcx), %xmm7 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX-NEXT: vmovaps 224(%rdx), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; AVX-NEXT: vbroadcastsd 232(%r8), %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 224(%r9), %ymm6, %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd (%rdi), %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 224(%r9), %ymm5, %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd (%rdi), %ymm5 ; AVX-NEXT: vmovapd (%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm5[2,3] ; AVX-NEXT: vmovapd (%r9), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 32(%rdi), %ymm6 -; AVX-NEXT: vmovapd 32(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX-NEXT: vmovapd 32(%r9), %ymm12 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 32(%rdi), %ymm5 +; AVX-NEXT: vmovapd 32(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX-NEXT: vmovapd 32(%r9), %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 64(%rdi), %ymm1 -; AVX-NEXT: vmovapd 64(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 64(%r9), %ymm6 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX-NEXT: vmovapd 64(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX-NEXT: vmovapd 64(%r9), %ymm5 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[3] ; AVX-NEXT: vmovupd %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX-NEXT: vmovapd 96(%rsi), %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 96(%r9), %ymm5 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX-NEXT: vmovapd 96(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX-NEXT: vmovapd 96(%r9), %ymm4 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX-NEXT: vmovapd 128(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 128(%r9), %ymm3 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 160(%rdi), %ymm1 -; AVX-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX-NEXT: vmovapd 128(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 160(%r9), %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[3] +; AVX-NEXT: vmovapd 128(%r9), %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm8[0],ymm2[2],ymm8[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 192(%rdi), %ymm2 -; AVX-NEXT: vmovapd 192(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd 192(%r9), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 224(%rdi), %ymm4 -; AVX-NEXT: vmovapd 224(%rsi), %ymm7 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm4[2,3] -; AVX-NEXT: vmovapd 224(%r9), %ymm4 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm4[2,3],ymm7[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm8 -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 16(%rdx), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX-NEXT: vbroadcastsd 24(%r8), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX-NEXT: vmovapd 160(%rdi), %ymm2 +; AVX-NEXT: vmovapd 160(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX-NEXT: vmovapd 160(%r9), %ymm2 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm8[0],ymm3[2],ymm8[3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 192(%rdi), %ymm3 +; AVX-NEXT: vmovapd 192(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm3[2,3] +; AVX-NEXT: vmovapd 192(%r9), %ymm3 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[3] +; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 224(%rdi), %ymm6 +; AVX-NEXT: vmovapd 224(%rsi), %ymm8 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm6[2,3] +; AVX-NEXT: vmovapd 224(%r9), %ymm6 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[3] +; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 16(%rdx), %xmm8 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],mem[1] +; AVX-NEXT: vbroadcastsd 24(%r8), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3] +; AVX-NEXT: vmovaps 48(%rdi), %xmm9 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 48(%rdx), %xmm8 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],mem[1] +; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm9 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 48(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 56(%r8), %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX-NEXT: vbroadcastsd 56(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -4856,79 +4858,79 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 88(%r8), %ymm7 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX-NEXT: vmovaps 112(%rdi), %xmm7 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 112(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm5 +; AVX-NEXT: vbroadcastsd 120(%r8), %ymm8 +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 144(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 144(%rcx), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX-NEXT: vbroadcastsd 144(%rcx), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 144(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 152(%r8), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX-NEXT: vbroadcastsd 152(%r8), %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX-NEXT: vmovaps 176(%rdi), %xmm4 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 176(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 184(%r8), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX-NEXT: vbroadcastsd 184(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 208(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 208(%rcx), %ymm2 +; AVX-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 208(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 216(%r8), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX-NEXT: vbroadcastsd 216(%r8), %ymm12 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] +; AVX-NEXT: vmovaps 240(%rdi), %xmm12 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 240(%rdx), %xmm15 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] -; AVX-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3] +; AVX-NEXT: vmovapd 240(%rdx), %xmm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],mem[1] +; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm1 +; AVX-NEXT: vbroadcastsd 248(%r8), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = xmm0[0],mem[0] -; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm13[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX-NEXT: # xmm12 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm10[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm11[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX-NEXT: # xmm10 = xmm0[0],mem[0] @@ -4977,8 +4979,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm11, 192(%rax) ; AVX-NEXT: vmovaps %xmm12, 400(%rax) ; AVX-NEXT: vmovaps %xmm13, 384(%rax) -; AVX-NEXT: vmovaps %xmm15, 784(%rax) -; AVX-NEXT: vmovaps %xmm14, 768(%rax) +; AVX-NEXT: vmovaps %xmm14, 784(%rax) +; AVX-NEXT: vmovaps %xmm15, 768(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 
# 32-byte Reload @@ -5298,9 +5300,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -5333,40 +5335,40 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-NEXT: vbroadcastsd 144(%r9), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-NEXT: vbroadcastsd 176(%rcx), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 184(%r8), %ymm1 @@ -5376,8 +5378,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcastsd 208(%rcx), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vbroadcastsd 208(%rcx), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -5387,27 +5389,27 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 216(%r8), %ymm2 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-NEXT: vmovaps 224(%rsi), %ymm9 ; AVX2-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] ; AVX2-NEXT: vbroadcastsd 240(%rcx), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] +; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm9 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-NEXT: vbroadcastsd 248(%r8), %ymm9 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -5428,11 +5430,11 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-NEXT: vmovaps %ymm6, 928(%rax) -; AVX2-NEXT: vmovaps %ymm10, 896(%rax) -; AVX2-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-NEXT: vmovaps %ymm11, 896(%rax) +; 
AVX2-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 768(%rax) -; AVX2-NEXT: vmovaps %ymm9, 736(%rax) +; AVX2-NEXT: vmovaps %ymm10, 736(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5453,7 +5455,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-NEXT: vmovaps %ymm9, 160(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5731,9 +5733,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -5766,40 +5768,40 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd 144(%r9), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; 
AVX2-FP-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vbroadcastsd 176(%rcx), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 184(%r8), %ymm1 @@ -5809,8 +5811,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vbroadcastsd 208(%rcx), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vbroadcastsd 208(%rcx), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -5820,27 +5822,27 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 216(%r8), %ymm2 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm9 ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd 240(%rcx), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] +; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm9 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-FP-NEXT: vbroadcastsd 248(%r8), %ymm9 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -5861,11 +5863,11 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-FP-NEXT: vmovaps %ymm6, 928(%rax) -; AVX2-FP-NEXT: vmovaps %ymm10, 896(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 896(%rax) +; AVX2-FP-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 736(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 736(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5886,7 +5888,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FP-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 160(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6164,9 +6166,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -6199,40 +6201,40 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 144(%r9), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 176(%rcx), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 184(%r8), %ymm1 @@ -6242,8 +6244,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 208(%rcx), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 208(%rcx), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -6253,27 +6255,27 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 216(%r8), %ymm2 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 224(%rsi), 
%ymm12 +; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm9 ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 240(%rcx), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm9 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 248(%r8), %ymm12 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 248(%r8), %ymm9 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -6294,11 +6296,11 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm6, 928(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 896(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 896(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 768(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 736(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm10, 736(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6319,7 +6321,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 160(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 
@@ -6364,25 +6366,25 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -6391,23 +6393,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -6425,59 +6427,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512-NEXT: 
vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512-NEXT: movb $12, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6486,8 +6489,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-NEXT: movb $48, %al ; AVX512-NEXT: kmovw %eax, %k2 @@ -6507,18 +6510,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -6526,36 +6529,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, 
%zmm5, %zmm8 ; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -6564,48 +6568,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512-NEXT: movb $16, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512-NEXT: 
vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -6616,14 +6620,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -6635,31 +6639,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 +; 
AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -6668,23 +6672,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512-FCP-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -6702,59 +6706,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-FCP-NEXT: 
vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512-FCP-NEXT: movb $12, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6763,8 +6768,8 @@ define void @store_i64_stride6_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512-FCP-NEXT: movb $48, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 @@ -6784,18 +6789,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -6803,36 +6808,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; 
AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -6841,48 +6847,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512-FCP-NEXT: movb $16, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 
{%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -6893,14 +6899,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -6912,31 +6918,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512DQ-NEXT: vmovdqa64 
128(%rsi), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -6945,23 +6951,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512DQ-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -6979,59 +6985,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512DQ-NEXT: movb $12, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7040,8 +7047,8 @@ define void 
@store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-NEXT: movb $48, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 @@ -7061,18 +7068,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -7080,36 +7087,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -7118,48 +7126,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512DQ-NEXT: movb $16, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1 ; 
AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -7170,14 +7178,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -7189,31 +7197,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -7222,23 +7230,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -7256,59 +7264,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: movb $12, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7317,8 +7326,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: movb $48, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 @@ -7338,18 +7347,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -7357,36 +7366,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} 
zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -7395,48 +7405,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, 
%ymm19, %zmm6, %zmm6 ; AVX512DQ-FCP-NEXT: movb $16, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -7447,14 +7457,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -7466,31 +7476,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FCP-NEXT: addq 
$648, %rsp # imm = 0x288 +; AVX512DQ-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -7499,23 +7509,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -7533,59 +7543,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; 
AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512BW-NEXT: 
vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7594,8 +7605,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -7615,18 +7626,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -7634,37 +7645,38 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload @@ -7672,48 +7684,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -7724,14 +7736,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -7743,31 +7755,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -7776,23 +7788,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -7810,59 +7822,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; 
AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm4, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: movb $12, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7871,8 +7884,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: movb $48, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 @@ -7892,18 +7905,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -7911,36 +7924,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} 
zmm5 = [0,1,13,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -7949,48 +7963,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; 
AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movb $16, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -8001,14 +8015,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm3, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -8020,31 +8034,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-BW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -8053,23 +8067,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -8087,59 +8101,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; 
AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: movb $12, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8148,8 +8163,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-NEXT: movb $48, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 @@ -8169,18 +8184,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; 
AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -8188,36 +8203,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm15 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -8226,48 +8242,48 @@ define void 
@store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: movb $16, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [10,0,2,3,4,5,11,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, 
%zmm19, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -8278,14 +8294,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -8297,31 +8313,31 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -8330,23 +8346,23 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm27, %zmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm11, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm6, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 @@ -8364,59 +8380,60 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm16, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm24, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm28, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,13,6,14,5,13,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm29, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movb $12, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8425,8 +8442,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: movb $48, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 @@ -8446,18 +8463,18 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,9,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm30 @@ -8465,36 +8482,37 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,13,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,0,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, 
%zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -8503,48 +8521,48 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm10, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm11, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm9, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm19, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm19[1],mem[1],ymm19[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm19, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $16, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = 
[10,0,2,3,4,5,11,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,10,2,3,4,5,6,11] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm21, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm2 @@ -8555,14 +8573,14 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 960(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 896(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 576(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) @@ -8574,7 +8592,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $648, %rsp # imm = 0x288 
+; AVX512DQ-BW-FCP-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -9631,453 +9649,451 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-LABEL: store_i64_stride6_vf64: ; AVX: # %bb.0: ; AVX-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX-NEXT: vmovaps 32(%r8), %ymm7 -; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd (%r8), %ymm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovapd 32(%r8), %ymm8 +; AVX-NEXT: vmovapd (%r8), %ymm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX-NEXT: vmovaps (%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps (%rdi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm3 +; AVX-NEXT: vmovaps (%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm3 ; AVX-NEXT: vbroadcastsd 8(%r8), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] ; AVX-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 -; 
AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 64(%r8), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 32(%r9), %ymm4, %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastsd 40(%r8), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 64(%r8), %ymm14 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 72(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm3 +; AVX-NEXT: vbroadcastsd 72(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 96(%r8), %ymm9 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 96(%r8), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 104(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 96(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 128(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 128(%r8), %ymm7 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 128(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 136(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 128(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 160(%r8), %ymm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 160(%r8), %ymm13 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 168(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 160(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vbroadcastsd 168(%r8), %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 160(%r9), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 192(%r8), %ymm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 192(%r8), %ymm10 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 192(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 200(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 192(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm3 +; AVX-NEXT: vbroadcastsd 200(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 224(%r8), %ymm11 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps 224(%r8), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 232(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 224(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 224(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 256(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 256(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 256(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 256(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 256(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 264(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 256(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 256(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 288(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 288(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 296(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 288(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 288(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vbroadcastsd 296(%r8), %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 288(%r9), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 320(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd 
{{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 320(%r8), %ymm2 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 320(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 320(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 328(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 320(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 320(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 320(%r9), %ymm0, %ymm3 +; AVX-NEXT: vbroadcastsd 328(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 352(%r8), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 352(%r8), %ymm11 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 352(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 352(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 360(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; 
AVX-NEXT: vinsertf128 $1, 352(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 352(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vmovapd 384(%r8), %ymm12 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 384(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX-NEXT: vbroadcastsd 392(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 384(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 384(%r9), %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 416(%r8), %ymm13 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 416(%r8), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 416(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 424(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 416(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 416(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vbroadcastsd 424(%r8), %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 416(%r9), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 448(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovapd 448(%r8), %ymm14 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovapd 448(%r8), %ymm15 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 448(%rcx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 448(%rdx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vbroadcastsd 456(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 448(%r9), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 448(%rdx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, 448(%r9), %ymm0, %ymm3 +; AVX-NEXT: vbroadcastsd 456(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 480(%rsi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX-NEXT: vmovapd 480(%r8), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm3[0],ymm2[1],ymm3[2,3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 480(%rdx), %xmm2 -; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; AVX-NEXT: vbroadcastsd 488(%r8), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd (%rdi), %ymm2 -; AVX-NEXT: vmovapd (%rsi), %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd (%r9), %ymm4 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] +; AVX-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vmovapd 480(%r8), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX-NEXT: vmovapd 32(%rsi), %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = mem[2,3],ymm2[2,3] -; AVX-NEXT: vmovapd 32(%r9), %ymm6 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] +; AVX-NEXT: vmovaps 480(%rcx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 480(%rdx), %xmm3 +; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX-NEXT: vbroadcastsd 488(%r8), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 480(%r9), %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd (%rdi), %ymm3 +; AVX-NEXT: vmovapd (%rsi), %ymm4 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm3[2,3] +; AVX-NEXT: vmovapd (%r9), %ymm5 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX-NEXT: vmovapd 64(%rsi), %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 64(%r9), %ymm15 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] +; AVX-NEXT: vmovapd 32(%rdi), %ymm3 +; AVX-NEXT: vmovapd 32(%rsi), %ymm4 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; 
AVX-NEXT: vmovapd 32(%r9), %ymm8 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX-NEXT: vmovapd 64(%rdi), %ymm3 +; AVX-NEXT: vmovapd 64(%rsi), %ymm4 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm3[2,3] +; AVX-NEXT: vmovapd 64(%r9), %ymm3 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm4[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm4[0],ymm14[2],ymm4[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 96(%rdi), %ymm4 ; AVX-NEXT: vmovapd 96(%rsi), %ymm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm3[2,3] -; AVX-NEXT: vmovapd 96(%r9), %ymm3 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX-NEXT: vmovapd 96(%r9), %ymm14 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 128(%rdi), %ymm9 +; AVX-NEXT: vmovapd 128(%rdi), %ymm4 ; AVX-NEXT: vmovapd 128(%rsi), %ymm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX-NEXT: # ymm1 = mem[2,3],ymm9[2,3] -; AVX-NEXT: vmovapd 128(%r9), %ymm9 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3] +; AVX-NEXT: vmovapd 128(%r9), %ymm4 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX-NEXT: vmovapd 160(%rsi), %ymm1 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 160(%r9), %ymm8 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovapd 160(%rsi), %ymm7 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 160(%r9), %ymm13 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX-NEXT: vmovapd 192(%rsi), %ymm1 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 192(%rdi), %ymm7 +; AVX-NEXT: vmovapd 192(%rsi), %ymm0 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm10[2,3],ymm7[2,3] ; AVX-NEXT: vmovapd 192(%r9), %ymm7 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 224(%r9), %ymm10 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 256(%rdi), %ymm1 -; AVX-NEXT: vmovapd 256(%rsi), %ymm0 +; AVX-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX-NEXT: vmovapd 224(%rsi), %ymm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 256(%r9), %ymm11 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 224(%r9), %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 288(%rdi), %ymm0 -; AVX-NEXT: vmovapd 288(%rsi), %ymm1 +; AVX-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX-NEXT: vmovapd 256(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 288(%r9), %ymm2 +; AVX-NEXT: vmovapd 256(%r9), %ymm2 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 320(%rdi), %ymm1 -; AVX-NEXT: vmovapd 320(%rsi), %ymm0 +; AVX-NEXT: vmovapd 288(%rdi), %ymm1 +; AVX-NEXT: vmovapd 288(%rsi), %ymm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX-NEXT: vmovapd 320(%r9), %ymm2 +; AVX-NEXT: vmovapd 288(%r9), %ymm2 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 352(%rdi), %ymm0 -; AVX-NEXT: vmovapd 352(%rsi), %ymm1 +; AVX-NEXT: vmovapd 320(%rdi), %ymm0 +; AVX-NEXT: vmovapd 320(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX-NEXT: # ymm2 = mem[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 352(%r9), %ymm0 +; AVX-NEXT: vmovapd 320(%r9), %ymm0 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 352(%rdi), %ymm0 +; AVX-NEXT: vmovapd 352(%rsi), %ymm1 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 352(%r9), %ymm11 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 384(%rdi), %ymm0 ; AVX-NEXT: vmovapd 384(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -10089,25 +10105,26 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX-NEXT: vmovapd 416(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 416(%r9), %ymm13 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 416(%r9), %ymm9 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 448(%rdi), %ymm0 ; AVX-NEXT: vmovapd 448(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 448(%r9), %ymm14 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 448(%r9), %ymm15 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 480(%rdi), %ymm0 ; AVX-NEXT: vmovapd 480(%rsi), %ymm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX-NEXT: vmovapd 480(%r9), %ymm5 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX-NEXT: vmovapd 480(%r9), %ymm2 +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 16(%rdi), %xmm0 @@ -10120,19 +10137,19 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX-NEXT: vunpckhpd 
{{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX-NEXT: vbroadcastsd 48(%rcx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 48(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX-NEXT: vbroadcastsd 56(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -10144,19 +10161,19 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 112(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm2 +; AVX-NEXT: vbroadcastsd 120(%r8), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 144(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -10168,19 +10185,19 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX-NEXT: vbroadcastsd 176(%rcx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 176(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 184(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX-NEXT: vbroadcastsd 184(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -10192,18 +10209,18 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 240(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 240(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 248(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vbroadcastsd 240(%rcx), %ymm2 +; AVX-NEXT: vbroadcastsd 248(%r8), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 272(%rdi), %xmm0 @@ -10212,22 +10229,23 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vbroadcastsd 272(%rcx), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 272(%rdx), %xmm0 +; AVX-NEXT: vmovaps 272(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 280(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 304(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 304(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 312(%r8), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovaps 304(%rdx), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX-NEXT: vbroadcastsd 304(%rcx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastsd 312(%r8), %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10241,22 +10259,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 344(%r8), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovaps 368(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 368(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 368(%rdx), %xmm0 +; AVX-NEXT: vmovapd 368(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 376(%r8), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastsd 368(%rcx), %ymm2 +; AVX-NEXT: vbroadcastsd 376(%r8), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 400(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -10267,19 +10284,19 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 408(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 432(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 432(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovapd 432(%rdx), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = 
xmm1[1],mem[1] +; AVX-NEXT: vbroadcastsd 432(%rcx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 432(%rdx), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX-NEXT: vbroadcastsd 440(%r8), %ymm0 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -10291,20 +10308,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 472(%r8), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 496(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 496(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 496(%rdx), %xmm0 +; AVX-NEXT: vmovaps 496(%rdx), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 504(%r8), %ymm1 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastsd 496(%rcx), %ymm2 +; AVX-NEXT: vbroadcastsd 504(%r8), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -10652,15 +10670,15 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; 
AVX2-NEXT: vbroadcastsd 8(%r8), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 8(%r8), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%r9), %xmm0 @@ -11078,7 +11096,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 24(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm2 @@ -11088,14 +11106,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 64(%rdx), %ymm2 @@ -11122,15 +11140,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 128(%rdx), %ymm2 @@ -11158,9 +11175,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; 
AVX2-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -11194,9 +11211,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -11230,9 +11247,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -11266,9 +11283,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -11291,7 +11308,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-NEXT: vmovaps 416(%rdx), %ymm2 @@ -11301,29 +11319,29 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; 
AVX2-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-NEXT: vmovaps 448(%rdx), %ymm2 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-NEXT: vmovaps 448(%rdx), %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] ; AVX2-NEXT: vbroadcastsd 464(%rcx), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-NEXT: vbroadcastsd 464(%r9), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-NEXT: vbroadcastsd 472(%r8), %ymm2 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-NEXT: vbroadcastsd 472(%r8), %ymm1 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-NEXT: vmovaps 480(%rdx), %ymm4 @@ -11332,82 +11350,82 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 496(%rcx), %ymm8 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-NEXT: vbroadcastsd 504(%r8), %ymm2 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; 
AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm1, 3040(%rax) +; AVX2-NEXT: vmovaps %ymm3, 3008(%rax) +; AVX2-NEXT: vmovaps %ymm5, 2976(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm1, 2880(%rax) +; AVX2-NEXT: vmovaps %ymm9, 2848(%rax) +; AVX2-NEXT: vmovaps %ymm10, 2816(%rax) +; AVX2-NEXT: vmovaps %ymm13, 2784(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm1, 2688(%rax) +; AVX2-NEXT: vmovaps %ymm12, 2656(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm1, 2624(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm1, 2592(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovaps %ymm0, 3040(%rax) 
-; AVX2-NEXT: vmovaps %ymm3, 3008(%rax) -; AVX2-NEXT: vmovaps %ymm5, 2976(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2880(%rax) -; AVX2-NEXT: vmovaps %ymm6, 2848(%rax) -; AVX2-NEXT: vmovaps %ymm9, 2816(%rax) -; AVX2-NEXT: vmovaps %ymm11, 2784(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2688(%rax) -; AVX2-NEXT: vmovaps %ymm10, 2656(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2624(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2592(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 2496(%rax) -; AVX2-NEXT: vmovaps %ymm15, 2464(%rax) +; AVX2-NEXT: vmovaps %ymm1, 2496(%rax) +; AVX2-NEXT: vmovaps %ymm0, 2464(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2432(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2400(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2304(%rax) -; AVX2-NEXT: vmovaps %ymm1, 2272(%rax) +; AVX2-NEXT: vmovaps %ymm2, 2272(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2240(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2208(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2112(%rax) -; AVX2-NEXT: vmovaps %ymm2, 2080(%rax) +; AVX2-NEXT: vmovaps %ymm4, 2080(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2048(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2016(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1920(%rax) -; AVX2-NEXT: vmovaps %ymm4, 1888(%rax) +; AVX2-NEXT: vmovaps %ymm6, 1888(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1856(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11428,21 +11446,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 1440(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX2-NEXT: vmovaps %ymm12, 1312(%rax) +; AVX2-NEXT: vmovaps %ymm11, 1312(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1248(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-NEXT: vmovaps %ymm13, 1120(%rax) +; AVX2-NEXT: vmovaps %ymm14, 1120(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-NEXT: vmovaps %ymm14, 928(%rax) +; AVX2-NEXT: vmovaps %ymm15, 928(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: 
vmovaps %ymm0, 896(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11571,15 +11589,15 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-FP-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vbroadcastsd 8(%r8), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 8(%r8), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm0 @@ -11997,7 +12015,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 24(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm2 @@ -12007,14 +12025,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm2 @@ -12041,15 +12059,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: 
vbroadcastsd 112(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm2 @@ -12077,9 +12094,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -12113,9 +12130,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -12149,9 +12166,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -12185,9 +12202,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 
; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -12210,7 +12227,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps 416(%rdx), %ymm2 @@ -12220,29 +12238,29 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FP-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm2 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] ; AVX2-FP-NEXT: vbroadcastsd 464(%rcx), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vbroadcastsd 464(%r9), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-FP-NEXT: vbroadcastsd 472(%r8), %ymm2 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd 472(%r8), %ymm1 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-FP-NEXT: vmovaps 480(%rdx), %ymm4 @@ -12251,82 +12269,82 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 496(%rcx), %ymm8 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FP-NEXT: 
vbroadcastsd 496(%r9), %ymm2 ; AVX2-FP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-FP-NEXT: vbroadcastsd 504(%r8), %ymm2 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 3040(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 3008(%rax) +; AVX2-FP-NEXT: vmovaps %ymm5, 2976(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 2880(%rax) +; AVX2-FP-NEXT: vmovaps %ymm9, 2848(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 2816(%rax) +; AVX2-FP-NEXT: vmovaps %ymm13, 2784(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 2688(%rax) +; AVX2-FP-NEXT: vmovaps %ymm12, 2656(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 2624(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm1, 2592(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovaps %ymm0, 3040(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 3008(%rax) -; AVX2-FP-NEXT: vmovaps %ymm5, 2976(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2880(%rax) -; AVX2-FP-NEXT: vmovaps %ymm6, 2848(%rax) -; AVX2-FP-NEXT: vmovaps %ymm9, 2816(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 2784(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2688(%rax) -; AVX2-FP-NEXT: vmovaps %ymm10, 2656(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2624(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2592(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 2496(%rax) -; AVX2-FP-NEXT: vmovaps %ymm15, 2464(%rax) +; AVX2-FP-NEXT: vmovaps %ymm1, 2496(%rax) +; AVX2-FP-NEXT: vmovaps %ymm0, 2464(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2432(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2400(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2304(%rax) -; AVX2-FP-NEXT: vmovaps %ymm1, 2272(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 2272(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2240(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2208(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2112(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 2080(%rax) +; AVX2-FP-NEXT: vmovaps %ymm4, 2080(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: 
vmovaps %ymm0, 2048(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2016(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1920(%rax) -; AVX2-FP-NEXT: vmovaps %ymm4, 1888(%rax) +; AVX2-FP-NEXT: vmovaps %ymm6, 1888(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1856(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12347,21 +12365,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX2-FP-NEXT: vmovaps %ymm12, 1312(%rax) +; AVX2-FP-NEXT: vmovaps %ymm11, 1312(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1248(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-FP-NEXT: vmovaps %ymm13, 1120(%rax) +; AVX2-FP-NEXT: vmovaps %ymm14, 1120(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-FP-NEXT: vmovaps %ymm14, 928(%rax) +; AVX2-FP-NEXT: vmovaps %ymm15, 928(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12490,15 +12508,15 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 8(%r8), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 8(%r8), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm0 @@ -12916,7 +12934,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 
24(%r8), %ymm1 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm2 @@ -12926,14 +12944,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 48(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm2 @@ -12960,15 +12978,14 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm2 @@ -12996,9 +13013,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -13032,9 +13049,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -13068,9 +13085,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 304(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -13104,9 +13121,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 368(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] @@ -13129,7 +13146,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 416(%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 416(%rdx), %ymm2 @@ -13139,29 +13157,29 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX2-FCP-NEXT: 
vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm2 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 464(%rcx), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 464(%r9), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-FCP-NEXT: vbroadcastsd 472(%r8), %ymm2 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd 472(%r8), %ymm1 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 480(%rdx), %ymm4 @@ -13170,82 +13188,82 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 496(%rcx), %ymm8 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-FCP-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 496(%r9), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-FCP-NEXT: vbroadcastsd 504(%r8), %ymm2 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 3040(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 3008(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm5, 2976(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 2880(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm9, 2848(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm10, 2816(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm13, 2784(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 2688(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm12, 2656(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 2624(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm1, 2592(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm0, 3040(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 3008(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm5, 2976(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2880(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm6, 2848(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm9, 2816(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 2784(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2688(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm10, 2656(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2624(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2592(%rax) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 2496(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm15, 2464(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm1, 2496(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm0, 2464(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2432(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2400(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2304(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm1, 2272(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 2272(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2240(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2208(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2112(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 2080(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm4, 2080(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2048(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2016(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1920(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm4, 1888(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm6, 1888(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1856(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13266,21 +13284,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm12, 1312(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm11, 1312(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1248(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm13, 1120(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm14, 1120(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 
1088(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm14, 928(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm15, 928(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13389,32 +13407,32 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i64_stride6_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -13422,439 +13440,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512-NEXT: vpermt2q 
%zmm27, %zmm13, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 
-; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512-NEXT: vpermt2q %zmm11, 
%zmm0, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb 
$12, %al ; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $48, %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512-NEXT: vmovdqu64 
%zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512-NEXT: movb $16, %al ; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q 
%zmm6, %zmm1, %zmm25 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13868,21 +13889,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%r9), %zmm8 
+; AVX512-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -13894,32 +13915,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -13928,96 +13947,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14025,7 +14044,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14033,7 +14052,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14041,73 +14060,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
; AVX512-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride6_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512-FCP-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -14115,439 +14135,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 
%zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $12, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 
64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $48, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; 
AVX512-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: 
vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512-FCP-NEXT: movb $16, %al ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512-FCP-NEXT: 
vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, 
%zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14561,21 +14584,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -14587,32 +14610,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 
64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -14621,96 +14642,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; 
AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14718,7 +14739,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14726,7 +14747,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14734,73 +14755,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512-FCP-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512DQ-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -14808,439 +14830,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 
{{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; 
AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermt2q 
%zmm4, %zmm16, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512DQ-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQ-NEXT: 
vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $12, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $48, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm16 # 
64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512DQ-NEXT: movb $16, %al ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q 
%zmm10, %zmm1, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15254,21 +15279,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -15280,32 +15305,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512DQ-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -15314,96 +15337,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: 
vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512DQ-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512DQ-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512DQ-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512DQ-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512DQ-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512DQ-NEXT: vpermt2q 
%zmm22, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15411,7 +15434,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512DQ-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15419,7 +15442,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15427,73 +15450,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 
2304(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i64_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512DQ-FCP-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -15501,439 +15525,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 384(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512DQ-FCP-NEXT: # zmm28 
= mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $12, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $48, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} 
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512DQ-FCP-NEXT: movb $16, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 
; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
[0,1,2,9,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15947,21 +15974,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -15973,32 +16000,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, 
%zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -16007,96 +16032,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; 
AVX512DQ-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 
2496(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16104,7 +16129,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16112,7 +16137,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16120,73 +16145,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 
192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-FCP-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512BW-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -16194,439 +16220,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512BW-NEXT: 
vpermt2q %zmm12, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; 
AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512BW-NEXT: 
vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $12, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $48, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} 
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: 
vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa 
448(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16640,21 +16669,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -16666,32 +16695,30 @@ define void 
@store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -16700,96 +16727,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512BW-NEXT: 
vmovdqa64 %zmm27, 2752(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16797,7 +16824,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16805,7 +16832,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16813,73 +16840,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512BW-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride6_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512BW-FCP-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -16887,439 +16915,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm8, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm13, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $12, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} 
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $48, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 +; 
AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: 
vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512BW-FCP-NEXT: movb $16, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17333,21 +17364,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -17359,32 +17390,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -17393,96 +17422,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 
3008(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17490,7 +17519,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17498,7 +17527,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17506,73 +17535,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512BW-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512BW-FCP-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride6_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -17580,439 +17610,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm29, %zmm21, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 
+; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $12, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm18 
{%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $48, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; 
AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q 
%zmm29, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, 
%zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: movb $16, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18026,21 +18059,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -18052,32 +18085,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -18086,96 +18117,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; 
AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, 
%zmm0, %zmm18 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512DQ-BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18183,7 +18214,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18191,7 +18222,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18199,73 +18230,74 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; 
AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-BW-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i64_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-FCP-NEXT: subq $3592, %rsp # imm = 0xE08 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,5,13,4,12,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm14, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 @@ -18273,439 +18305,442 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm30, %zmm11, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm16, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm11 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm3, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm2, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm21, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,4,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 -; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [1,9,2,10,1,9,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,13,6,14,5,13,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm27, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm28, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm11 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm7 -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm28, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm16, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $12, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $48, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm0[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, (%rsp), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,9,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm13 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm29 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 
{%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,13,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q 
%zmm9, %zmm1, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,0,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm30 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: 
vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $16, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [10,0,2,3,4,5,11,0] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm17 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 
%zmm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm19 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm31 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,9,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, 
%zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18719,21 +18754,21 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,13,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 @@ -18745,32 +18780,30 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,14,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm6, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -18779,96 +18812,96 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm7[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r8), %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm15, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm10, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm12, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 320(%r8), %zmm2, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm9, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 128(%r8), %zmm10, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 192(%r8), %zmm12, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 256(%r8), %zmm3, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 
320(%r8), %zmm7, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 384(%r8), %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,10,2,3,4,5,6,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,12,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 3008(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 3008(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 2944(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 2816(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 2816(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 2752(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2624(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2560(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 2496(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 2432(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2368(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18876,7 +18909,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 2112(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2048(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 2048(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18884,7 +18917,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1664(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1600(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18892,42 +18925,43 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1216(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1088(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 896(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 704(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 
448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 2688(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 2304(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 1920(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1536(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 2688(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 2304(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1920(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 384(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $3720, %rsp # imm = 0xE88 +; AVX512DQ-BW-FCP-NEXT: addq $3592, %rsp # imm = 0xE08 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index d1fd4a360036b..3aef91c5c7dfe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -169,15 +169,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -191,15 +191,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, 
(%r10), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -213,15 +213,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -235,15 +235,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -257,15 +257,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, 
%ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -279,15 +279,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -301,15 +301,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -323,15 
+323,15 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [3,5,7,9,11,13,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -410,50 +410,50 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vmovaps (%rsi), %ymm2 -; AVX-NEXT: vmovaps (%r8), %ymm3 -; AVX-NEXT: vmovaps (%r10), %xmm4 -; AVX-NEXT: vmovaps 16(%r10), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] -; AVX-NEXT: vmovaps (%r9), %xmm5 -; AVX-NEXT: vmovaps (%r8), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vmovaps (%rsi), %ymm4 +; AVX-NEXT: vmovaps (%r8), %ymm5 +; AVX-NEXT: vmovaps (%r10), %xmm1 +; AVX-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 16(%r10), %xmm3 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps (%r9), %xmm6 +; AVX-NEXT: vmovaps (%r8), %xmm7 +; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] ; AVX-NEXT: vmovaps (%rdi), %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm9 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX-NEXT: vmovaps (%rdx), %xmm9 ; AVX-NEXT: vmovaps (%rsi), %xmm10 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] ; AVX-NEXT: vbroadcastsd 8(%rcx), %ymm12 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX-NEXT: vunpcklpd 
{{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX-NEXT: vmovaps 16(%rcx), %xmm11 ; AVX-NEXT: vmovaps 16(%rdx), %xmm12 ; AVX-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],mem[1] -; AVX-NEXT: vbroadcastsd 24(%r9), %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm10[0] -; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm9[0],mem[0] +; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],mem[1] +; AVX-NEXT: vbroadcastsd 24(%r9), %ymm11 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm10[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm9[0],mem[0] ; AVX-NEXT: vmovaps %ymm1, 96(%rax) -; AVX-NEXT: vmovaps %xmm5, 16(%rax) -; AVX-NEXT: vmovaps %xmm4, (%rax) -; AVX-NEXT: vmovaps %ymm3, 128(%rax) -; AVX-NEXT: vmovaps %ymm2, 192(%rax) -; AVX-NEXT: vmovaps %ymm6, 64(%rax) -; AVX-NEXT: vmovaps %ymm7, 32(%rax) +; AVX-NEXT: vmovaps %xmm6, 16(%rax) +; AVX-NEXT: vmovaps %xmm2, (%rax) +; AVX-NEXT: vmovaps %ymm5, 128(%rax) +; AVX-NEXT: vmovaps %ymm4, 192(%rax) +; AVX-NEXT: vmovaps %ymm7, 64(%rax) +; AVX-NEXT: vmovaps %ymm3, 32(%rax) ; AVX-NEXT: vmovaps %ymm0, 160(%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -488,24 +488,24 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm10, %ymm10 ; AVX2-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] ; AVX2-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] ; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-NEXT: vbroadcastsd 24(%r9), %ymm7 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-NEXT: vbroadcastsd 24(%r9), %ymm2 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] ; AVX2-NEXT: vmovaps %ymm6, 128(%rax) ; AVX2-NEXT: vmovaps %ymm1, 192(%rax) ; AVX2-NEXT: vmovaps %ymm9, (%rax) ; AVX2-NEXT: vmovaps %ymm4, 96(%rax) ; AVX2-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ 
-540,24 +540,24 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm10, %ymm10 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 ; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vbroadcastsd 24(%r9), %ymm7 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%r9), %ymm2 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovaps %ymm6, 128(%rax) ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax) ; AVX2-FP-NEXT: vmovaps %ymm9, (%rax) ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax) ; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -592,24 +592,24 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm10, %ymm10 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r9), %ymm7 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r9), %ymm2 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm6, 128(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm9, (%rax) ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -618,40 +618,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512-NEXT: vmovdqa (%r8), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512-NEXT: vmovdqa (%r8), %ymm2 +; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512-NEXT: movb $112, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512-NEXT: movb $28, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -663,40 +663,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512-FCP-NEXT: 
vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512-FCP-NEXT: movb $112, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512-FCP-NEXT: movb $28, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -708,40 +708,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512DQ-NEXT: movb $112, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-NEXT: movb $56, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512DQ-NEXT: movb $28, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -753,40 +753,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: movb $28, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -798,40 +798,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: movb $112, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512BW-NEXT: vpermi2q 
%zmm4, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -843,40 +843,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-FCP-NEXT: movb $112, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512BW-FCP-NEXT: movb $28, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -888,40 +888,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-NEXT: movb $112, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: movb $56, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: movb $28, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -933,40 +933,40 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [15,3,7,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,4,8,12,0,0,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [5,9,13,0,0,0,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,6,0,0,0,11,15,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,6,u,u,u,11,15,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $28, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1121,7 +1121,6 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: pushq %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps (%rsi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdx), %ymm7 ; AVX-NEXT: vmovaps (%r9), %ymm3 ; AVX-NEXT: vmovaps (%rax), 
%xmm5 @@ -1152,7 +1151,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] ; AVX-NEXT: vinsertf128 $1, 32(%rcx), %ymm8, %ymm10 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vbroadcastsd 8(%rcx), %ymm8 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX-NEXT: vmovaps (%rsi), %xmm8 @@ -1180,45 +1180,45 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovapd 48(%rdi), %xmm0 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2] -; AVX-NEXT: vmovapd 32(%rax), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm15[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX-NEXT: vmovapd 32(%r8), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vmovapd 48(%rcx), %xmm0 -; AVX-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX-NEXT: vmovaps (%rdi), %ymm4 -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX-NEXT: vmovapd 32(%rax), %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] +; AVX-NEXT: vmovapd 32(%r8), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX-NEXT: vmovapd 48(%rcx), %xmm15 +; AVX-NEXT: vmovapd 48(%rdx), %xmm6 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX-NEXT: vmovaps (%rdi), %ymm6 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vmovaps (%r8), %ymm5 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX-NEXT: vmovaps 16(%rdx), %xmm5 ; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX-NEXT: vbroadcastsd 56(%r9), %ymm5 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],mem[1] +; AVX-NEXT: vbroadcastsd 56(%r9), %ymm6 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm8[0] +; AVX-NEXT: vmovaps (%rdx), %xmm6 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0] -; AVX-NEXT: vmovaps (%rdx), %xmm5 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX-NEXT: vmovaps %xmm5, 16(%rax) -; AVX-NEXT: vmovaps %xmm2, (%rax) +; AVX-NEXT: vmovaps %xmm6, 16(%rax) +; AVX-NEXT: vmovaps %xmm5, (%rax) ; AVX-NEXT: vmovaps %ymm3, 128(%rax) -; AVX-NEXT: vmovaps %ymm4, 96(%rax) -; AVX-NEXT: vmovapd %ymm1, 352(%rax) -; AVX-NEXT: vmovapd %ymm15, 
384(%rax) +; AVX-NEXT: vmovaps %ymm2, 96(%rax) +; AVX-NEXT: vmovapd %ymm4, 352(%rax) +; AVX-NEXT: vmovapd %ymm0, 384(%rax) ; AVX-NEXT: vmovapd %ymm11, 320(%rax) ; AVX-NEXT: vmovaps %ymm12, 32(%rax) ; AVX-NEXT: vmovaps %ymm10, 192(%rax) ; AVX-NEXT: vmovaps %ymm7, 64(%rax) -; AVX-NEXT: vmovapd %ymm6, 224(%rax) -; AVX-NEXT: vmovapd %ymm0, 416(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 224(%rax) +; AVX-NEXT: vmovapd %ymm1, 416(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1235,93 +1235,93 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm9 ; AVX2-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm9 -; AVX2-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-NEXT: vmovaps (%rax), %xmm2 +; AVX2-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-NEXT: vmovaps (%rax), %xmm1 ; AVX2-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-NEXT: vmovaps 48(%rax), %xmm12 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 32(%rax), %xmm12 +; AVX2-NEXT: vmovaps 48(%rax), %xmm13 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm3 ; AVX2-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX2-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm14[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; 
AVX2-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 +; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm12, %ymm12 -; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm15 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-NEXT: vmovaps (%r8), %xmm15 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vmovaps (%r8), %ymm13 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vmovaps (%rsi), %xmm15 +; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm15, %ymm15 +; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-NEXT: vmovaps (%r8), %xmm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovaps (%r8), %ymm12 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-NEXT: vmovaps (%r9), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-NEXT: vmovaps (%r9), %ymm15 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] ; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-NEXT: vbroadcastsd 56(%r9), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vunpcklpd 
{{.*#+}} xmm6 = xmm15[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX2-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-NEXT: vmovaps %ymm9, 384(%rcx) +; AVX2-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-NEXT: vmovaps %ymm11, 384(%rcx) ; AVX2-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 352(%rcx) @@ -1340,93 +1340,93 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm9 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm9 -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-FP-NEXT: vmovaps (%rax), %xmm2 +; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FP-NEXT: vmovaps (%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-FP-NEXT: vmovaps 48(%rax), %xmm12 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm12 +; AVX2-FP-NEXT: vmovaps 48(%rax), %xmm13 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm3 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX2-FP-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 
= ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm14[2,3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm12, %ymm12 -; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm15 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-FP-NEXT: vmovaps (%r8), %xmm15 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm15 +; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm15, %ymm15 +; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-FP-NEXT: vmovaps (%r8), %xmm0 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = 
ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-FP-NEXT: vmovaps (%r9), %ymm0 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-FP-NEXT: vmovaps (%r9), %ymm15 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] ; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FP-NEXT: vbroadcastsd 56(%r9), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm15[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-FP-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm9, 384(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm11, 384(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rcx) @@ -1445,93 +1445,93 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm9 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm5 
-; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm9 -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-FCP-NEXT: vmovaps (%rax), %xmm2 +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FCP-NEXT: vmovaps (%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm12 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm12 +; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm13 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX2-FCP-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm14[2,3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps (%rsi), 
%xmm12 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm12, %ymm12 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm15 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-FCP-NEXT: vmovaps (%r8), %xmm15 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm15 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm15, %ymm15 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-FCP-NEXT: vmovaps (%r8), %xmm0 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm0 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm15 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 56(%r9), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm15[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm9, 64(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm9, 384(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm11, 384(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rcx) @@ -1576,9 +1576,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 ; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1595,27 +1595,27 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movb $96, %sil ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, %k2 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512-NEXT: vinserti32x4 $3, (%r10), %zmm6, %zmm5 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 ; 
AVX512-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 ; AVX512-NEXT: movb $-61, %sil ; AVX512-NEXT: kmovw %esi, %k2 @@ -1631,16 +1631,16 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512-NEXT: movb $6, %cl ; AVX512-NEXT: kmovw %ecx, %k2 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm14 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm1, %zmm14, %zmm13 ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm7 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -1675,9 +1675,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 @@ -1687,12 +1687,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 ; AVX512-FCP-NEXT: movb $96, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm6 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 @@ -1701,26 +1701,26 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm3, 
%zmm1, %zmm10 +; AVX512-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm10, %zmm9 ; AVX512-FCP-NEXT: movb $112, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm10 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm11 ; AVX512-FCP-NEXT: movb $-61, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} @@ -1732,7 +1732,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] ; AVX512-FCP-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %sil @@ -1748,39 +1748,39 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512-FCP-NEXT: movb $28, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm2[2,3,2,3] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512-FCP-NEXT: movb $6, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 ; AVX512-FCP-NEXT: movb $120, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) @@ -1816,9 +1816,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 ; AVX512DQ-NEXT: movb $-61, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 @@ -1840,9 +1840,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -1857,7 +1857,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] @@ -1872,9 +1872,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512DQ-NEXT: movb $56, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; 
AVX512DQ-NEXT: movb $12, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 @@ -1914,14 +1914,13 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i64_stride7_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 @@ -1930,10 +1929,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: movb $-61, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} @@ -1942,20 +1941,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: movb $48, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,u] ; AVX512DQ-FCP-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-FCP-NEXT: movb $14, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512DQ-FCP-NEXT: movb $96, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 @@ -1963,7 +1962,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-FCP-NEXT: movb $28, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; 
AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 @@ -1973,24 +1972,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 ; AVX512DQ-FCP-NEXT: movb $6, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: movb $56, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} @@ -2002,28 +2001,29 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: movb $12, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512DQ-FCP-NEXT: movb $112, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm14, %zmm12 {%k2} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: movb $120, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) @@ -2065,9 +2065,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -2084,27 +2084,27 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $96, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vinserti32x4 $3, (%r10), %zmm5, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $3, (%r10), %zmm6, %zmm5 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k2} ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,0,14,6,5,0,14,6] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 ; AVX512BW-NEXT: movb $-61, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 @@ -2120,16 +2120,16 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: movb $6, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; 
AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm14, %zmm13 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] @@ -2164,9 +2164,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 @@ -2176,12 +2176,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: movb $96, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm6 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm8 @@ -2190,26 +2190,26 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%r10), %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: movb $112, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,0,14,6,5,0,14,6] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,6,7,0,13,6,7] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm11 ; AVX512BW-FCP-NEXT: movb $-61, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} @@ -2221,7 +2221,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [1,3,7,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] ; AVX512BW-FCP-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %sil @@ -2237,39 +2237,39 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX512BW-FCP-NEXT: movb $28, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm11[2,3,2,3],zmm2[2,3,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $6, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = 
[4,12,0,5,4,12,0,5] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: movb $120, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) @@ -2305,9 +2305,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: movb $-61, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 @@ -2329,9 +2329,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm11 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] @@ -2346,7 +2346,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] @@ -2361,9 +2361,9 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 ; AVX512DQ-BW-NEXT: movb $56, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: movb $12, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 @@ -2403,14 +2403,13 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-LABEL: store_i64_stride7_vf8: ; 
AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8 @@ -2419,10 +2418,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm8[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $-61, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} @@ -2431,20 +2430,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 ; AVX512DQ-BW-FCP-NEXT: movb $48, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [1,3,7,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movb $96, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 @@ -2452,7 +2451,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = 
[15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11 @@ -2462,24 +2461,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movb $6, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movb $56, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} @@ -2491,28 +2490,29 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-BW-FCP-NEXT: movb $12, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movb $112, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm14, %zmm12 {%k2} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,0,5,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movb $120, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) @@ -2810,7 +2810,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i64_stride7_vf16: ; AVX: # %bb.0: -; AVX-NEXT: subq $520, %rsp # imm = 0x208 +; AVX-NEXT: subq $552, %rsp # imm = 0x228 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps (%rsi), %ymm5 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2868,18 +2868,18 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vbroadcastsd 40(%rcx), %ymm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 32(%rsi), %ymm10 +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 48(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 48(%rax), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%r9), %ymm9 ; AVX-NEXT: vmovaps 48(%r8), %xmm0 @@ -2930,41 +2930,42 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] -; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 ; AVX-NEXT: vmovaps 96(%r8), %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm2[0],mem[0] ; AVX-NEXT: vinsertf128 $1, 96(%rax), %ymm14, %ymm14 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm1[1] ; AVX-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps (%r8), %ymm0 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX-NEXT: vmovaps 16(%rdx), %xmm1 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm12[1] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm12[1] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 32(%r8), %ymm0 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX-NEXT: vmovaps 48(%rdx), %xmm9 -; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vmovaps 48(%rdx), %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm6[1] @@ -2982,48 +2983,49 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovapd 112(%rdi), %xmm6 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] -; AVX-NEXT: vmovapd 96(%rax), %ymm8 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3] -; AVX-NEXT: vmovapd 112(%rcx), %xmm6 +; AVX-NEXT: vmovapd 96(%rax), %ymm7 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] +; AVX-NEXT: vmovapd 112(%rcx), %xmm8 ; AVX-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],xmm8[0] ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] ; AVX-NEXT: vbroadcastsd 112(%r9), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] -; AVX-NEXT: vbroadcastsd 120(%r9), %ymm10 -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm10[3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],mem[1] +; AVX-NEXT: vbroadcastsd 120(%r9), %ymm9 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3] 
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX-NEXT: # xmm9 = xmm9[0],mem[0] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps 64(%rdx), %xmm11 +; AVX-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX-NEXT: vmovaps (%rdx), %xmm11 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX-NEXT: vmovaps (%rdx), %xmm12 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX-NEXT: vmovaps %xmm12, 16(%rax) -; AVX-NEXT: vmovaps %xmm10, (%rax) -; AVX-NEXT: vmovaps %xmm11, 464(%rax) +; AVX-NEXT: vmovaps %xmm11, 16(%rax) +; AVX-NEXT: vmovaps %xmm9, (%rax) +; AVX-NEXT: vmovaps %xmm10, 464(%rax) ; AVX-NEXT: vmovaps %xmm8, 448(%rax) ; AVX-NEXT: vmovapd %ymm4, 832(%rax) ; AVX-NEXT: vmovapd %ymm3, 768(%rax) ; AVX-NEXT: vmovaps %ymm0, 576(%rax) ; AVX-NEXT: vmovaps %ymm5, 544(%rax) -; AVX-NEXT: vmovaps %ymm7, 352(%rax) -; AVX-NEXT: vmovaps %ymm2, 320(%rax) -; AVX-NEXT: vmovaps %ymm1, 128(%rax) -; AVX-NEXT: vmovaps %ymm14, 96(%rax) -; AVX-NEXT: vmovapd %ymm6, 864(%rax) -; AVX-NEXT: vmovapd %ymm9, 800(%rax) -; AVX-NEXT: vmovaps %ymm13, 736(%rax) +; AVX-NEXT: vmovaps %ymm2, 352(%rax) +; AVX-NEXT: vmovaps %ymm1, 320(%rax) +; AVX-NEXT: vmovaps %ymm14, 128(%rax) +; AVX-NEXT: vmovaps %ymm13, 96(%rax) +; AVX-NEXT: vmovapd %ymm7, 864(%rax) +; AVX-NEXT: vmovapd %ymm6, 800(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 704(%rax) +; AVX-NEXT: vmovaps %ymm0, 736(%rax) ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 704(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 672(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 640(%rax) @@ -3051,7 +3053,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 64(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rax) -; AVX-NEXT: addq $520, %rsp # imm = 0x208 +; AVX-NEXT: addq $552, %rsp # imm = 0x228 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3059,133 +3061,133 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2: # %bb.0: ; AVX2-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-NEXT: vmovaps 64(%rsi), %ymm15 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-NEXT: vmovaps 64(%rsi), %ymm14 ; AVX2-NEXT: vmovaps (%rdx), %ymm4 ; 
AVX2-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX2-NEXT: vmovaps 16(%rax), %xmm0 ; AVX2-NEXT: vmovaps 32(%rax), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%r8), %xmm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] +; AVX2-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 40(%rcx), %ymm8 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovaps 48(%rax), %xmm8 +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX2-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vmovaps 48(%rax), %xmm6 -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,2,3,3] ; AVX2-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 80(%rax), %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%r8), %xmm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = 
xmm5[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 80(%rax), %xmm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%r8), %xmm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm6[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 96(%rax), %ymm8, %ymm8 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 104(%rcx), %ymm9 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 96(%rdx), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; AVX2-NEXT: vbroadcastsd 104(%rcx), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX2-NEXT: vmovaps 96(%rsi), %ymm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovaps 112(%rax), %xmm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps 112(%rax), %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-NEXT: vblendps 
{{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm9 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps (%r8), %xmm13 -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] ; AVX2-NEXT: vmovaps (%rax), %xmm9 ; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-NEXT: vmovaps (%r8), %ymm8 -; AVX2-NEXT: vmovaps (%r9), %ymm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX2-NEXT: vmovaps (%r9), %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] ; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm2 +; AVX2-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-NEXT: vmovaps 32(%r9), %ymm5 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm15 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 -; AVX2-NEXT: vunpcklpd 
{{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 72(%rcx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 64(%r8), %xmm4 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX2-NEXT: vmovaps 64(%rax), %xmm3 ; AVX2-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] @@ -3196,57 +3198,57 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] ; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps 96(%rax), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 120(%r9), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 96(%rax), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm8 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: 
vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-NEXT: vbroadcastsd %xmm3, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] -; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vbroadcastsd 112(%r9), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps %ymm4, 800(%rax) -; AVX2-NEXT: vmovaps %ymm12, 768(%rax) +; AVX2-NEXT: vmovaps %ymm10, 768(%rax) ; AVX2-NEXT: vmovaps %ymm14, 576(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 544(%rax) @@ -3272,11 +3274,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm4, 736(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm4, 704(%rax) -; AVX2-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-NEXT: vmovaps %ymm3, 672(%rax) ; AVX2-NEXT: vmovaps %ymm1, 640(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-NEXT: vmovaps %ymm3, 480(%rax) +; AVX2-NEXT: vmovaps %ymm2, 480(%rax) ; AVX2-NEXT: vmovaps %ymm5, 416(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: 
vmovaps %ymm1, 384(%rax) @@ -3285,11 +3287,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3298,133 +3300,133 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm15 +; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm14 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-FP-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm0 ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 40(%rcx), %ymm8 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 48(%rax), %xmm8 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX2-FP-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 48(%rax), %xmm6 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 80(%rax), %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 80(%rax), %xmm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%r8), %xmm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm6[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rax), %ymm8, %ymm8 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 104(%rcx), %ymm9 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 96(%rdx), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; AVX2-FP-NEXT: 
vbroadcastsd 104(%rcx), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm10 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FP-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovaps 112(%rax), %xmm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovaps 112(%rax), %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-FP-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 +; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm9 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] ; AVX2-FP-NEXT: vmovaps (%rax), %xmm9 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FP-NEXT: vmovaps (%r8), %ymm8 -; AVX2-FP-NEXT: vmovaps (%r9), %ymm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX2-FP-NEXT: vmovaps (%r9), %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm5 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm15 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd 72(%rcx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm4 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm3 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] @@ -3435,57 +3437,57 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm14 = 
ymm14[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] ; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 96(%rax), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 120(%r9), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 96(%rax), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm8 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm2 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vbroadcastsd 112(%r9), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps %ymm4, 800(%rax) -; AVX2-FP-NEXT: vmovaps %ymm12, 768(%rax) +; AVX2-FP-NEXT: vmovaps %ymm10, 768(%rax) ; AVX2-FP-NEXT: vmovaps %ymm14, 576(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 544(%rax) @@ -3511,11 +3513,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm4, 736(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm4, 704(%rax) -; AVX2-FP-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-FP-NEXT: vmovaps %ymm3, 672(%rax) ; AVX2-FP-NEXT: vmovaps %ymm1, 640(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-FP-NEXT: vmovaps %ymm3, 480(%rax) +; AVX2-FP-NEXT: vmovaps %ymm2, 480(%rax) ; AVX2-FP-NEXT: vmovaps %ymm5, 416(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rax) @@ -3524,11 +3526,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FP-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FP-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FP-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-FP-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-FP-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -3537,133 +3539,133 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2 -; 
AVX2-FCP-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm14 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm0 ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm5 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 40(%rcx), %ymm8 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm8 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX2-FCP-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 48(%rax), %xmm6 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 80(%rax), %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm5 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 80(%rax), %xmm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm6[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rax), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 104(%rcx), %ymm9 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; AVX2-FCP-NEXT: vbroadcastsd 104(%rcx), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm10 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovaps 112(%rax), %xmm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovaps 112(%rax), %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-FCP-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm8, %ymm8 +; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm9 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm9 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FCP-NEXT: vmovaps (%r8), %ymm8 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm5 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm15 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd 72(%rcx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm4 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm3 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] @@ -3674,57 +3676,57 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] ; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rax), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] +; 
AVX2-FCP-NEXT: vbroadcastsd 120(%r9), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 96(%rax), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm8 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm6 ; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm2 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-FCP-NEXT: 
vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 112(%r9), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps %ymm4, 800(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm12, 768(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm10, 768(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm14, 576(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm4, 544(%rax) @@ -3750,11 +3752,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm4, 736(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm4, 704(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm3, 672(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm1, 640(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm3, 480(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm2, 480(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm5, 416(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rax) @@ -3763,17 +3765,18 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FCP-NEXT: vmovaps %ymm11, 864(%rax) +; AVX2-FCP-NEXT: vmovaps %ymm12, 864(%rax) ; AVX2-FCP-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i64_stride7_vf16: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm18 @@ -3781,49 +3784,49 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm3 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q 
%zmm3, %zmm16, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 +; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-NEXT: vpermt2q %zmm7, %zmm5, %zmm14 ; AVX512-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 +; AVX512-NEXT: vpermt2q %zmm3, %zmm13, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 +; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm3 ; AVX512-NEXT: movb $48, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] ; AVX512-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512-NEXT: movb $64, %sil ; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19 {%k1} ; AVX512-NEXT: vmovdqa64 (%r8), %zmm20 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm30 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm31 @@ -3832,35 +3835,36 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpermi2q %zmm4, %zmm22, %zmm18 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512-NEXT: vpmovsxbq {{.*#+}} 
zmm19 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm27 ; AVX512-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 -; AVX512-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 -; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512-NEXT: vpermi2q %zmm31, %zmm4, %zmm26 +; AVX512-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $96, %sil ; AVX512-NEXT: kmovw %esi, %k2 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 +; AVX512-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm2, %zmm10, %zmm22 +; AVX512-NEXT: vpermi2q %zmm2, %zmm11, %zmm13 +; AVX512-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -3871,13 +3875,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 +; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm4 ; AVX512-NEXT: vmovdqa64 64(%r9), %ymm31 ; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 ; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 ; AVX512-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 ; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] ; AVX512-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 ; AVX512-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] @@ -3892,11 +3896,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] -; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermi2q %zmm8, %zmm28, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 @@ -3912,11 +3916,11 @@ define 
void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} +; AVX512-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm14 {%k3} ; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm14 {%k4} ; AVX512-NEXT: vmovdqa64 64(%rdx), %xmm27 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 @@ -3924,57 +3928,59 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512-NEXT: movb $120, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} -; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: movb $6, %sil -; AVX512-NEXT: kmovw %esi, %k4 -; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} -; AVX512-NEXT: movb $56, %sil ; AVX512-NEXT: kmovw %esi, %k5 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512-NEXT: movb $56, %sil +; AVX512-NEXT: kmovw %esi, %k4 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 {%k4} ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512-NEXT: movb $-31, %sil ; AVX512-NEXT: kmovw %esi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: movb $-61, %sil ; AVX512-NEXT: kmovw %esi, %k1 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm17 {%k4} ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: 
vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rax) ; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm19, 768(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512-NEXT: popq %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3990,12 +3996,12 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512-FCP-NEXT: movb $64, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 @@ -4013,16 +4019,16 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -4033,22 +4039,22 @@ define void 
@store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 ; AVX512-FCP-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 -; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm17 ; AVX512-FCP-NEXT: movb $48, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] ; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 @@ -4059,8 +4065,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm28 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 @@ -4069,9 +4075,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] ; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4088,13 +4094,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] ; AVX512-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512-FCP-NEXT: movb $96, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] ; AVX512-FCP-NEXT: # zmm23 = 
mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 ; AVX512-FCP-NEXT: movb $28, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 @@ -4136,11 +4142,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} ; AVX512-FCP-NEXT: movb $120, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} ; AVX512-FCP-NEXT: movb $14, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} ; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: movb $6, %sil @@ -4153,16 +4159,16 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movb $-31, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm28[4,5,6,7] ; AVX512-FCP-NEXT: movb $-61, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} -; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -4174,7 +4180,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 448(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax) @@ -4186,6 +4192,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-LABEL: store_i64_stride7_vf16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm18 @@ -4193,199 +4200,202 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm17 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 
= [6,14,6,14,6,14,6,14] +; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm19 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm29, %zmm3 ; AVX512DQ-NEXT: movb $48, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm16, %zmm18 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512DQ-NEXT: movb $64, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm8, %zmm22 ; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermi2q 
%zmm16, %zmm22, %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpermi2q %zmm17, %zmm22, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $96, %sil ; AVX512DQ-NEXT: kmovw %esi, %k2 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm29, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm10 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm17 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 -; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm31, %zmm23 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 ; AVX512DQ-NEXT: 
vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 -; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm20, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm12[0],ymm28[2],ymm12[2] ; AVX512DQ-NEXT: movb $28, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm27 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm27, %zmm31 +; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm8[2,3,2,3],zmm27[2,3,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %xmm27 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: movb $12, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm4 {%k5} ; AVX512DQ-NEXT: movb $112, %sil -; AVX512DQ-NEXT: kmovw %esi, %k7 -; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} +; AVX512DQ-NEXT: kmovw %esi, %k6 +; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k6} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 
%zmm26, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm8 {%k2} ; AVX512DQ-NEXT: movb $120, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 {%k3} +; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: movb $6, %sil ; AVX512DQ-NEXT: kmovw %esi, %k4 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm14 {%k4} +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k5} +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm5 {%k6} ; AVX512DQ-NEXT: movb $56, %sil -; AVX512DQ-NEXT: kmovw %esi, %k6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} +; AVX512DQ-NEXT: kmovw %esi, %k5 +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm14 {%k5} ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512DQ-NEXT: movb $-31, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-NEXT: movb $-61, %sil ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm9 {%k2} +; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm28[1],ymm12[1],ymm28[3],ymm12[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm15 {%k5} ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4413,12 +4423,12 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 @@ -4426,7 +4436,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -4442,7 +4452,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm26, %zmm28 ; AVX512DQ-FCP-NEXT: movb $112, %sil ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] @@ -4471,23 +4481,23 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $96, %sil ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} ; AVX512DQ-FCP-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm22 ; AVX512DQ-FCP-NEXT: movb $120, %sil ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm20 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 @@ -4500,9 +4510,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm11, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 ; AVX512DQ-FCP-NEXT: movb $-61, %dil ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1 @@ -4513,23 +4523,23 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm21 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: movb $14, %sil ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] ; AVX512DQ-FCP-NEXT: kmovw %esi, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-FCP-NEXT: vpermi2q %ymm20, %ymm12, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] @@ 
-4546,22 +4556,22 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} ; AVX512DQ-FCP-NEXT: movb $28, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm20[0],ymm12[2],ymm20[2] +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $6, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} @@ -4569,15 +4579,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) @@ -4592,12 +4602,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride7_vf16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 @@ -4605,49 +4616,49 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm3 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm14 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm18, %zmm5 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm3 ; AVX512BW-NEXT: movb $48, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm18[0],zmm9[0],zmm18[2],zmm9[2],zmm18[4],zmm9[4],zmm18[6],zmm9[6] ; 
AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm18 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512BW-NEXT: movb $64, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm20 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm31 @@ -4656,35 +4667,36 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm26, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $96, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm21, %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4695,13 +4707,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k2} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [9,1,9,1,9,1,9,1] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %ymm31 ; 
AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm20, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] @@ -4716,11 +4728,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k3} = zmm30[2,3,2,3],zmm28[2,3,2,3] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm28, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,6,7,0,13,6,7] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm7, %zmm8 @@ -4736,11 +4748,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm28 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm4 {%k3} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm0, %zmm14 {%k3} ; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm27, %zmm27 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm14 {%k4} ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm27 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 @@ -4748,57 +4760,59 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm26, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512BW-NEXT: movb $120, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k3} -; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: movb $6, %sil -; AVX512BW-NEXT: kmovd %esi, %k4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k4} -; AVX512BW-NEXT: movb $56, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k5} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512BW-NEXT: movb $56, %sil +; AVX512BW-NEXT: kmovd %esi, %k4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k4} ; AVX512BW-NEXT: 
vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512BW-NEXT: movb $-31, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-NEXT: movb $-61, %sil ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} +; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k5} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm31[1],ymm23[3],ymm31[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k4} ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4814,12 +4828,12 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm6 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $64, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 @@ -4837,16 +4851,16 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,1,2,3,4,15,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -4857,22 +4871,22 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm7, %zmm22 ; AVX512BW-FCP-NEXT: vpermi2q %zmm19, %zmm26, %zmm10 -; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm23, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm14, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm17 ; AVX512BW-FCP-NEXT: movb $48, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,3,11,3,11,3,11,3] ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm29 @@ -4883,8 +4897,8 @@ define void 
@store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm17, %zmm28 -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm28 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm30 ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14 @@ -4893,9 +4907,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm27 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm18 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -4912,13 +4926,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm0 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm23[0],ymm0[2],ymm23[2] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm15 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,7,7] ; AVX512BW-FCP-NEXT: vpermt2q %ymm23, %ymm15, %ymm0 ; AVX512BW-FCP-NEXT: movb $96, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,1,12,7,0,1,12,7] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 {%k2} ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm18 ; AVX512BW-FCP-NEXT: movb $28, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 @@ -4960,11 +4974,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: movb $120, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18 {%k3} ; AVX512BW-FCP-NEXT: movb $14, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} ; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $6, %sil @@ -4977,16 +4991,16 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movb $-31, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm28[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm16[0,1,2,3],zmm28[4,5,6,7] ; AVX512BW-FCP-NEXT: movb $-61, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13 {%k3} ; AVX512BW-FCP-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm6 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} -; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k5} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2 {%k6} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -4998,7 +5012,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 448(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) @@ -5010,6 +5024,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512DQ-BW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-BW: # %bb.0: +; AVX512DQ-BW-NEXT: pushq %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm18 @@ -5017,199 +5032,202 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm17 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm19 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm19 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQ-BW-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm15, %zmm14 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm18, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm3 ; AVX512DQ-BW-NEXT: movb $48, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm17, %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm18 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm18[0,1,2,3],zmm19[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $64, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm22 ; AVX512DQ-BW-NEXT: movb $24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm22, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm26 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm10, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $96, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k2 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm22 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm29, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm21, %zmm13 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm23 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm16 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm17 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %ymm23 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm31, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm29 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm6 -; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm20, %zmm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm20, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermi2q %zmm20, %zmm28, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm23[0],ymm28[2],ymm23[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm28[0],ymm12[0],ymm28[2],ymm12[2] ; AVX512DQ-BW-NEXT: movb $28, %sil ; AVX512DQ-BW-NEXT: kmovd 
%esi, %k3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k3} = zmm30[2,3,2,3],zmm27[2,3,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm27 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm27, %zmm31 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm7, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm27, %zmm31 +; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm29[2,3,2,3],zmm27[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k3} = zmm8[2,3,2,3],zmm27[2,3,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm30, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm27 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm27[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-BW-NEXT: movb $12, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm27, %zmm0, %zmm4 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm4 {%k5} ; AVX512DQ-BW-NEXT: movb $112, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k7 -; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k7} +; AVX512DQ-BW-NEXT: kmovd %esi, %k6 +; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm26, %zmm4 {%k6} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8 {%k2} ; AVX512DQ-BW-NEXT: movb $120, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k3} +; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm8 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $6, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm5 {%k6} ; AVX512DQ-BW-NEXT: movb $56, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k6} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k5} -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm16, %zmm5 {%k7} +; AVX512DQ-BW-NEXT: kmovd %esi, %k5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k5} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: movb $-31, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm17[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm16[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-NEXT: movb $-61, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k3} -; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k6} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm23[1],ymm28[3],ymm23[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm28[1],ymm12[1],ymm28[3],ymm12[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k5} ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 512(%rax) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 832(%rax) +; AVX512DQ-BW-NEXT: popq %rax ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -5237,12 +5255,12 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm20, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 @@ -5250,7 +5268,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -5266,7 +5284,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm26, %zmm28 ; AVX512DQ-BW-FCP-NEXT: movb $112, %sil ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4] @@ -5295,23 +5313,23 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $96, %sil ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,12,0,5,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm22 ; 
AVX512DQ-BW-FCP-NEXT: movb $120, %sil ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm9, %zmm30 @@ -5324,9 +5342,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm11, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30 ; AVX512DQ-BW-FCP-NEXT: movb $-61, %dil ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 @@ -5337,23 +5355,23 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm28, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm28 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm7, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm4, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k2} = zmm10[0],zmm2[0],zmm10[2],zmm2[2],zmm10[4],zmm2[4],zmm10[6],zmm2[6] ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm20, %ymm11, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm20, %ymm12, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] @@ -5370,22 +5388,22 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $28, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = 
zmm10[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm11[0],ymm20[0],ymm11[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm10[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm20[0],ymm12[2],ymm20[2] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm4[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: movb $6, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} @@ -5393,15 +5411,15 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) @@ -5416,7 +5434,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 704(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 832(%rax) 
; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -5996,7 +6014,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i64_stride7_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX-NEXT: subq $1352, %rsp # imm = 0x548 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps (%rdi), %ymm3 ; AVX-NEXT: vmovaps (%rsi), %ymm1 @@ -6006,8 +6024,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps (%r9), %xmm6 ; AVX-NEXT: vmovaps (%r8), %xmm7 ; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX-NEXT: vmovaps (%rdi), %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm9 +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm9 ; AVX-NEXT: vmovaps (%rax), %xmm10 ; AVX-NEXT: vmovaps 16(%rax), %xmm11 ; AVX-NEXT: vmovaps 32(%rax), %xmm0 @@ -6042,51 +6061,51 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 16(%r8), %xmm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm6 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX-NEXT: vinsertf128 $1, 32(%rcx), %ymm3, %ymm4 -; AVX-NEXT: vmovaps 32(%rdx), %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vmovaps 32(%r9), %xmm3 -; AVX-NEXT: vmovaps 32(%r8), %xmm4 -; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] +; AVX-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[2] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX-NEXT: vmovaps 32(%r9), %xmm4 +; AVX-NEXT: vmovaps 32(%r8), %xmm5 +; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX-NEXT: vbroadcastsd 40(%rcx), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX-NEXT: vbroadcastsd 40(%rcx), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%r8), 
%ymm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; AVX-NEXT: vmovaps 32(%r9), %ymm3 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX-NEXT: vmovaps 32(%r9), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 48(%rcx), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX-NEXT: vmovaps 48(%rdx), %xmm2 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vmovaps 48(%rax), %xmm2 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 48(%r8), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] @@ -6104,8 +6123,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX-NEXT: vbroadcastsd 72(%rcx), %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3] +; AVX-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -6134,52 +6154,52 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, 96(%rcx), %ymm4, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, 96(%rcx), %ymm2, %ymm3 -; AVX-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 96(%r9), %xmm2 -; AVX-NEXT: vmovaps 96(%r8), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] +; AVX-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, 
%ymm4, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 96(%r9), %xmm3 +; AVX-NEXT: vmovaps 96(%r8), %xmm4 +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] ; AVX-NEXT: vmovaps 96(%rax), %xmm6 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm6[1] ; AVX-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX-NEXT: vmovaps 96(%rsi), %ymm2 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%r8), %ymm0 -; AVX-NEXT: vmovaps 96(%r9), %ymm1 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX-NEXT: vmovaps 96(%r8), %ymm1 +; AVX-NEXT: vmovaps 96(%r9), %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vmovaps 112(%rcx), %xmm3 ; AVX-NEXT: vmovaps 112(%rdx), %xmm4 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 112(%rax), %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 112(%rax), %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 112(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -6187,8 +6207,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 128(%r9), %xmm0 ; AVX-NEXT: vmovaps 128(%r8), %xmm1 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovaps 128(%rdi), %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX-NEXT: vmovaps 128(%rdi), %xmm14 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm3 ; AVX-NEXT: vmovaps 128(%rax), %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -6272,34 
+6292,34 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 176(%r8), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%r9), %xmm0 ; AVX-NEXT: vmovaps 192(%r8), %xmm1 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 -; AVX-NEXT: vmovaps 192(%rax), %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX-NEXT: vmovaps 192(%rax), %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdx), %ymm2 -; AVX-NEXT: vbroadcastsd 200(%rcx), %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastsd 200(%rcx), %ymm2 +; AVX-NEXT: vmovaps 192(%rdx), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovaps 192(%rsi), %xmm9 +; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; AVX-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%r8), %ymm0 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX-NEXT: vmovaps 208(%rcx), %xmm4 @@ -6308,175 +6328,177 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX-NEXT: vmovaps 208(%rax), %xmm2 +; AVX-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm3[0] +; AVX-NEXT: vinsertf128 $1, 224(%rcx), %ymm7, %ymm8 ; 
AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm2[0] -; AVX-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX-NEXT: vmovapd 224(%r8), %ymm3 -; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX-NEXT: vbroadcastsd 232(%rcx), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX-NEXT: vmovapd 224(%rdi), %ymm1 -; AVX-NEXT: vmovapd 224(%rsi), %ymm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm2[0],ymm8[1],ymm2[2],ymm8[2] +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovapd 224(%r8), %ymm5 +; AVX-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm7 +; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX-NEXT: vbroadcastsd 232(%rcx), %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] +; AVX-NEXT: vmovapd 224(%rdi), %ymm2 +; AVX-NEXT: vmovapd 224(%rsi), %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] -; AVX-NEXT: vmovapd 240(%rdi), %xmm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm2[2,3] +; AVX-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,2] ; AVX-NEXT: vmovapd 224(%rax), %ymm0 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2],ymm15[3] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX-NEXT: vmovapd 240(%rcx), %xmm15 -; AVX-NEXT: vmovapd 240(%rdx), %xmm14 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3] -; AVX-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] -; AVX-NEXT: vbroadcastsd 248(%r9), %ymm15 -; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; 
AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm13[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] -; AVX-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = xmm12[0],mem[0] -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],mem[1] +; AVX-NEXT: vbroadcastsd 216(%r9), %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX-NEXT: vmovapd 240(%rcx), %xmm4 +; AVX-NEXT: vmovapd 240(%rdx), %xmm3 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX-NEXT: vbroadcastsd 240(%r9), %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm3[0,1,2],ymm1[3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],mem[1] +; AVX-NEXT: vbroadcastsd 248(%r9), %ymm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0],ymm0[1],ymm15[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3] +; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm13[0] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps 128(%rdx), %xmm15 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX-NEXT: vmovaps 192(%rdx), %xmm11 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX-NEXT: vmovaps (%rdx), %xmm10 +; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] +; AVX-NEXT: vmovaps 64(%rdx), %xmm10 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps %xmm10, 16(%rsi) -; AVX-NEXT: vmovaps %xmm9, (%rsi) -; AVX-NEXT: vmovaps %xmm11, 1360(%rsi) -; AVX-NEXT: vmovaps %xmm8, 1344(%rsi) -; AVX-NEXT: vmovaps %xmm12, 464(%rsi) -; AVX-NEXT: vmovaps %xmm14, 448(%rsi) -; AVX-NEXT: vmovaps %xmm15, 912(%rsi) -; AVX-NEXT: vmovaps %xmm13, 896(%rsi) -; AVX-NEXT: vmovapd %ymm0, 1760(%rsi) -; AVX-NEXT: vmovapd %ymm2, 1728(%rsi) -; AVX-NEXT: vmovapd %ymm3, 1696(%rsi) -; AVX-NEXT: vmovapd %ymm1, 1664(%rsi) -; AVX-NEXT: vmovaps %ymm5, 1632(%rsi) -; AVX-NEXT: vmovaps %ymm6, 1600(%rsi) -; AVX-NEXT: vmovapd %ymm7, 1568(%rsi) -; AVX-NEXT: vmovaps %ymm4, 1536(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1504(%rsi) +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 192(%rdx), %xmm14 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vmovaps (%rdx), %xmm13 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX-NEXT: vmovaps %xmm13, 16(%rax) +; AVX-NEXT: vmovaps %xmm0, (%rax) +; AVX-NEXT: vmovaps %xmm14, 1360(%rax) +; AVX-NEXT: vmovaps %xmm9, 1344(%rax) +; AVX-NEXT: vmovaps %xmm10, 464(%rax) +; AVX-NEXT: vmovaps %xmm2, 448(%rax) +; AVX-NEXT: vmovaps %xmm15, 912(%rax) +; AVX-NEXT: vmovaps %xmm5, 
896(%rax) +; AVX-NEXT: vmovapd %ymm1, 1760(%rax) +; AVX-NEXT: vmovapd %ymm7, 1728(%rax) +; AVX-NEXT: vmovapd %ymm6, 1696(%rax) +; AVX-NEXT: vmovapd %ymm4, 1664(%rax) +; AVX-NEXT: vmovaps %ymm8, 1632(%rax) +; AVX-NEXT: vmovaps %ymm11, 1600(%rax) +; AVX-NEXT: vmovapd %ymm12, 1568(%rax) +; AVX-NEXT: vmovaps %ymm3, 1536(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1472(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1440(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1408(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1440(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 1408(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1376(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1312(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1312(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1280(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1248(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1248(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1216(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1216(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1184(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1184(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1152(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1152(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1120(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1120(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1088(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1056(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 1024(%rsi) +; AVX-NEXT: vmovaps %ymm0, 1024(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 992(%rsi) +; AVX-NEXT: vmovaps %ymm0, 992(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 960(%rsi) +; AVX-NEXT: vmovaps %ymm0, 960(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 928(%rsi) +; AVX-NEXT: vmovaps %ymm0, 928(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 864(%rsi) +; AVX-NEXT: vmovaps %ymm0, 864(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 832(%rsi) +; AVX-NEXT: vmovaps %ymm0, 832(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 800(%rsi) +; AVX-NEXT: vmovaps %ymm0, 800(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 768(%rsi) +; AVX-NEXT: vmovaps %ymm0, 768(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload -; AVX-NEXT: vmovaps %ymm0, 736(%rsi) +; AVX-NEXT: vmovaps %ymm0, 736(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 704(%rsi) +; AVX-NEXT: vmovaps %ymm0, 704(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX-NEXT: vmovaps %ymm0, 672(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 640(%rsi) +; AVX-NEXT: vmovaps %ymm0, 640(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 608(%rsi) +; AVX-NEXT: vmovaps %ymm0, 608(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 576(%rsi) +; AVX-NEXT: vmovaps %ymm0, 576(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 544(%rsi) +; AVX-NEXT: vmovaps %ymm0, 544(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 512(%rsi) +; AVX-NEXT: vmovaps %ymm0, 512(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX-NEXT: vmovaps %ymm0, 480(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX-NEXT: vmovaps %ymm0, 416(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX-NEXT: vmovaps %ymm0, 384(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX-NEXT: vmovaps %ymm0, 352(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX-NEXT: vmovaps %ymm0, 320(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX-NEXT: vmovaps %ymm0, 288(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX-NEXT: vmovaps %ymm0, 256(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX-NEXT: vmovaps %ymm0, 224(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX-NEXT: vmovaps %ymm0, 192(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX-NEXT: vmovaps %ymm0, 160(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX-NEXT: vmovaps %ymm0, 128(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX-NEXT: vmovaps %ymm0, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX-NEXT: vmovaps %ymm0, 64(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX-NEXT: addq $1320, %rsp # imm = 0x528 +; AVX-NEXT: vmovaps %ymm0, 32(%rax) +; AVX-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6487,15 +6509,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%r8), %ymm12 -; AVX2-NEXT: vmovaps (%r9), %ymm9 -; AVX2-NEXT: vmovups 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%r8), %ymm13 +; AVX2-NEXT: vmovaps (%r9), %ymm8 +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6510,17 +6532,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vmovaps (%rax), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rax), %xmm7 +; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -6528,7 +6550,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -6572,10 +6594,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 72(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 64(%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -6652,28 +6674,27 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX2-NEXT: vbroadcastsd 136(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 136(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 128(%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%rax), %xmm15 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%r8), %ymm15 +; AVX2-NEXT: vmovaps 128(%r8), %ymm14 ; AVX2-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6687,13 +6708,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdx), %xmm11 +; AVX2-NEXT: vmovaps 160(%rdx), %xmm10 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -6707,9 +6728,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: 
vmovaps 160(%rdx), %ymm0 -; AVX2-NEXT: vmovaps 160(%r8), %ymm10 -; AVX2-NEXT: vmovaps 160(%r9), %ymm9 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-NEXT: vmovaps 160(%r8), %ymm9 +; AVX2-NEXT: vmovaps 160(%r9), %ymm7 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6721,168 +6742,169 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm6, %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-NEXT: vbroadcastsd 200(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps 192(%r8), %xmm5 -; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 200(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 192(%r8), %xmm4 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rax), %xmm4 +; AVX2-NEXT: vmovaps 192(%rax), %xmm3 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vmovaps 192(%rsi), %ymm5 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovaps 208(%rdx), %xmm8 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 224(%rax), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 224(%r8), %ymm2 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[0,1],ymm0[0,1] +; AVX2-NEXT: vbroadcastsd 224(%rax), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 232(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; AVX2-NEXT: vbroadcastsd 232(%rcx), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 224(%rsi), %ymm5 +; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] ; AVX2-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 240(%rax), %xmm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 152(%rcx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; 
AVX2-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-NEXT: vmovaps 240(%rax), %xmm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 216(%r9), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX2-NEXT: vbroadcastsd 224(%rcx), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm5 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm8 +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5],ymm5[6,7] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-NEXT: # ymm15 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 152(%rcx), %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm7 +; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm11 +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-NEXT: vbroadcastsd 216(%rcx), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-NEXT: vbroadcastsd 216(%r9), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-NEXT: vbroadcastsd 224(%rcx), %ymm6 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 248(%r9), %ymm7 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded 
Reload -; AVX2-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vbroadcastsd 240(%r9), %ymm6 +; AVX2-NEXT: vbroadcastsd 248(%r9), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovaps %ymm2, 1760(%rcx) -; AVX2-NEXT: vmovaps %ymm14, 1728(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm2, 1728(%rcx) ; AVX2-NEXT: vmovaps %ymm0, 1696(%rcx) ; AVX2-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6890,7 +6912,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1600(%rcx) ; AVX2-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-NEXT: vmovaps %ymm5, 1536(%rcx) +; AVX2-NEXT: vmovaps %ymm4, 1536(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6899,7 +6921,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-NEXT: vmovaps %ymm4, 1376(%rcx) +; AVX2-NEXT: vmovaps %ymm3, 1376(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1344(%rcx) ; AVX2-NEXT: vmovaps %ymm9, 1312(%rcx) @@ -6913,8 +6935,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-NEXT: vmovaps %ymm11, 1120(%rcx) -; AVX2-NEXT: vmovaps %ymm3, 1088(%rcx) +; AVX2-NEXT: vmovaps %ymm7, 1120(%rcx) +; AVX2-NEXT: vmovaps %ymm15, 1088(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6923,7 +6945,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-NEXT: vmovaps %ymm6, 928(%rcx) +; AVX2-NEXT: vmovaps %ymm5, 928(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 896(%rcx) ; AVX2-NEXT: vmovaps %ymm8, 864(%rcx) @@ -6938,8 +6960,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-NEXT: vmovaps %ymm12, 672(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 640(%rcx) +; AVX2-NEXT: 
vmovaps %ymm13, 640(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 608(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6948,11 +6969,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 512(%rcx) -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 384(%rcx) @@ -6991,15 +7012,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovaps (%r8), %ymm12 -; AVX2-FP-NEXT: vmovaps (%r9), %ymm9 -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FP-NEXT: vmovaps (%r9), %ymm8 +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7014,17 +7035,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vmovaps (%rax), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rax), %xmm7 +; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FP-NEXT: vmovups 
%ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -7032,7 +7053,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -7076,10 +7097,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 72(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7156,28 +7177,27 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 136(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 136(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm15 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm15 +; AVX2-FP-NEXT: vmovaps 128(%r8), %ymm14 ; AVX2-FP-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7191,13 +7211,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm11 +; AVX2-FP-NEXT: vmovaps 160(%rdx), %xmm10 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-FP-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7211,9 +7231,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm10 -; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm9 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FP-NEXT: vmovaps 160(%r8), %ymm9 +; AVX2-FP-NEXT: vmovaps 160(%r9), %ymm7 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7225,168 +7245,169 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm6, %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 200(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm5 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 200(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm4 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 ; 
AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm4 +; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm3 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm5 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm8 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 224(%rax), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 224(%r8), %ymm2 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[0,1],ymm0[0,1] +; AVX2-FP-NEXT: vbroadcastsd 224(%rax), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 232(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FP-NEXT: 
vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; AVX2-FP-NEXT: vbroadcastsd 232(%rcx), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm5 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 240(%rax), %xmm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; 
AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 152(%rcx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FP-NEXT: vmovaps 240(%rax), %xmm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm8 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, (%rsp), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FP-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FP-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 216(%r9), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX2-FP-NEXT: vbroadcastsd 224(%rcx), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm5 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm8 +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd 
{{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5],ymm5[6,7] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm15 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 152(%rcx), %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm7 +; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm11 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-FP-NEXT: vbroadcastsd 216(%rcx), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 216(%r9), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FP-NEXT: vbroadcastsd 224(%rcx), %ymm6 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 248(%r9), %ymm7 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vbroadcastsd 240(%r9), %ymm6 +; AVX2-FP-NEXT: vbroadcastsd 248(%r9), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovaps %ymm2, 1760(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm14, 1728(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm2, 1728(%rcx) ; 
AVX2-FP-NEXT: vmovaps %ymm0, 1696(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7394,7 +7415,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1600(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm5, 1536(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, 1536(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7403,7 +7424,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm4, 1376(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 1376(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1344(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm9, 1312(%rcx) @@ -7417,8 +7438,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm11, 1120(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, 1088(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 1120(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm15, 1088(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7427,7 +7448,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm6, 928(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm5, 928(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 896(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm8, 864(%rcx) @@ -7442,8 +7463,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm12, 672(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 640(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm13, 640(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 608(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7452,11 +7472,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rcx) -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: 
vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rcx) @@ -7495,15 +7515,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm12 -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm9 -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm8 +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7518,17 +7538,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vmovaps (%rax), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rax), %xmm7 +; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -7536,7 +7556,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -7580,10 +7600,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 72(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 72(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7660,28 +7680,27 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 136(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 136(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm15 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm15 +; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm14 ; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7695,13 +7714,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovaps 160(%rdi), 
%xmm13 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm11 +; AVX2-FCP-NEXT: vmovaps 160(%rdx), %xmm10 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-FCP-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7715,9 +7734,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm10 -; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm9 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm9 +; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm7 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7729,168 +7748,169 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm6, %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 200(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm5 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 200(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm4 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm4 +; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm3 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: 
vmovaps 192(%rsi), %ymm5 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm8 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 224(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm2 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[0,1],ymm0[0,1] +; AVX2-FCP-NEXT: vbroadcastsd 224(%rax), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 232(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; AVX2-FCP-NEXT: vbroadcastsd 232(%rcx), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm5 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 240(%rax), %xmm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 152(%rcx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 240(%rax), %xmm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, (%rsp), %ymm8, %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 216(%r9), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd 224(%rcx), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm5 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm8 +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm15 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-FCP-NEXT: 
vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 152(%rcx), %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm7 +; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm11 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-FCP-NEXT: vbroadcastsd 216(%rcx), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],mem[2,3] +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 216(%r9), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vbroadcastsd 224(%rcx), %ymm6 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 248(%r9), %ymm7 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 240(%r9), %ymm6 +; AVX2-FCP-NEXT: vbroadcastsd 248(%r9), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm2, 1760(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm14, 1728(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 1728(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm0, 1696(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm10, 1664(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7898,7 +7918,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps 
%ymm0, 1600(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm5, 1536(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, 1536(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7907,7 +7927,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 1376(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 1376(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1344(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm9, 1312(%rcx) @@ -7921,8 +7941,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm11, 1120(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 1088(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 1120(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm15, 1088(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7931,7 +7951,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm6, 928(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm5, 928(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 896(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm8, 864(%rcx) @@ -7946,8 +7966,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm12, 672(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm13, 640(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 608(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7956,11 +7975,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 512(%rcx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rcx) @@ -7994,33 +8013,32 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; 
AVX512-LABEL: store_i64_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm16 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm23 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm29 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512-NEXT: movb $96, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -8029,7 +8047,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 +; AVX512-NEXT: vpermt2q %zmm14, %zmm22, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa (%r9), %ymm10 ; AVX512-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill @@ -8042,14 +8060,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512-NEXT: movb $28, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 @@ -8067,12 +8085,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; 
AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] @@ -8099,13 +8116,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm0 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm28 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512-NEXT: vmovdqa 128(%r9), %ymm13 @@ -8135,21 +8152,20 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 +; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512-NEXT: vpermt2q %zmm7, %zmm2, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 ; AVX512-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8188,15 +8204,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 ; AVX512-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 +; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 -; 
AVX512-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 ; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 +; AVX512-NEXT: vpermt2q %zmm21, %zmm13, %zmm20 ; AVX512-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 ; AVX512-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -8207,43 +8223,43 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] -; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm15 ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm9[0],zmm8[0],zmm9[2],zmm8[2],zmm9[4],zmm8[4],zmm9[6],zmm8[6] +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 ; AVX512-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 ; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm9 ; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vpermi2q %zmm21, %zmm18, %zmm12 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512-NEXT: vpermt2q %zmm21, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [12,u,u,3,4,5,6,13] ; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 ; AVX512-NEXT: movb $24, %sil ; AVX512-NEXT: kmovw %esi, %k3 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; 
AVX512-NEXT: vmovdqa64 %zmm21, %zmm6 {%k3} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 ; AVX512-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -8253,35 +8269,35 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vmovdqa64 128(%rdx), %xmm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm22 {%k4} +; AVX512-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm8 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} ; AVX512-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} -; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 +; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm1, %zmm8 ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k4 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm22 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm8 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm1 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm20 {%k4} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} @@ -8289,8 +8305,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %esi, %k4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload -; AVX512-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm21 # 64-byte Folded Reload +; AVX512-NEXT: # zmm21 = zmm16[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: 
vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload @@ -8302,7 +8318,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movb $-61, %sil ; AVX512-NEXT: kmovw %esi, %k4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload ; AVX512-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -8335,9 +8351,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} ; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm23 {%k4} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} @@ -8355,7 +8371,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512-NEXT: vmovdqa 192(%r8), %ymm7 @@ -8385,17 +8401,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -8411,9 +8427,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 
512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 128(%rax) @@ -8422,14 +8438,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512-NEXT: vmovdqa64 %zmm20, 896(%rax) ; AVX512-NEXT: vmovdqa64 %zmm29, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rax) ; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm27, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm7, 1728(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 1600(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 1536(%rax) -; AVX512-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX512-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -8437,435 +8453,437 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: subq $2152, %rsp # imm = 0x868 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512-FCP-NEXT: movb $96, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 128(%r8), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 128(%r8), %ymm12 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX512-FCP-NEXT: movb $28, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] ; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 -; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = 
ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm11 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX512-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm12 +; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm6[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm12 ; AVX512-FCP-NEXT: movb $48, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k3 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), 
%zmm12 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] -; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm16 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm22 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, 
%zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vpermt2q %zmm28, 
%zmm4, %zmm18 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm31 +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm20, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,11,u,u,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,11,u,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm20, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $24, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-FCP-NEXT: vpunpcklqdq 
{{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k4} ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} ; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} ; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k4} ; AVX512-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 ; AVX512-FCP-NEXT: movb $112, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm5 +; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm9, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm18 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm6 +; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k4} +; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm17 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm30 {%k4} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k4} ; AVX512-FCP-NEXT: movb $120, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k4} +; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm23[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} ; AVX512-FCP-NEXT: movb $-61, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm29[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm28[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k4} ; AVX512-FCP-NEXT: movb $6, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} ; AVX512-FCP-NEXT: movb $-31, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} ; AVX512-FCP-NEXT: movb $56, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k3} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k4} ; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k3} ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; 
AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} ; AVX512-FCP-NEXT: movb $64, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512-FCP-NEXT: movb $8, %al ; AVX512-FCP-NEXT: kmovw %eax, %k4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k4} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [13,u,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] ; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm5 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 1472(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 640(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 896(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1728(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1536(%rax) ; AVX512-FCP-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -8874,11 +8892,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm17 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm7 @@ -8888,20 +8906,20 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-NEXT: movb $96, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; 
AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] @@ -8919,34 +8937,34 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX512DQ-NEXT: movb $28, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -8968,17 +8986,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 @@ -8997,16 +9015,16 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 -; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] @@ -9021,58 +9039,58 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm15 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm3, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm29 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm21 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm3, %zmm20 ; AVX512DQ-NEXT: movb $48, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k3 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm14, %zmm28 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm26 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm14, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm13, %zmm19 ; AVX512DQ-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 ; AVX512DQ-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm24 @@ -9081,48 +9099,48 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm15, %zmm4 +; AVX512DQ-NEXT: vpunpcklqdq 
{{.*#+}} zmm2 {%k3} = zmm17[0],zmm0[0],zmm17[2],zmm0[2],zmm17[4],zmm0[4],zmm17[6],zmm0[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm15, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm11, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm6, %zmm10 -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm6, %zmm10 +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm11, %zmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm15, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm15, %zmm12 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm20, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [12,u,u,3,4,5,6,13] +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm3, %zmm20 ; AVX512DQ-NEXT: movb $24, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpermi2q %zmm22, %zmm4, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} ; AVX512DQ-NEXT: movb $120, %sil @@ -9160,18 +9178,10 @@ define void 
@store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-NEXT: movb $-31, %sil -; AVX512DQ-NEXT: kmovw %esi, %k3 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} ; AVX512DQ-NEXT: movb $112, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 @@ -9180,10 +9190,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k4} ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -9195,13 +9208,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} +; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm19 {%k3} ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} +; AVX512DQ-NEXT: movb $-31, %sil +; AVX512DQ-NEXT: kmovw %esi, %k3 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512DQ-NEXT: movb $56, %sil ; AVX512DQ-NEXT: kmovw %esi, %k3 ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9213,22 +9231,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} ; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} ; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} ; AVX512DQ-NEXT: movb $64, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k4 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm24 {%k4} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm14 {%k3} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%r8), %ymm6 @@ -9248,8 +9266,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm22 {%k1} ; AVX512DQ-NEXT: movb $8, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} @@ -9259,17 +9277,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm20, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm16, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 
= [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1472(%rax) @@ -9281,17 +9299,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 896(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 640(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 512(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) @@ -9311,460 +9329,457 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-FCP-NEXT: movb $96, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FCP-NEXT: # zmm11 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %ymm16 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] ; AVX512DQ-FCP-NEXT: movb $28, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 
= [1,3,7,7] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] -; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] +; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQ-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm30[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm5 ; AVX512DQ-FCP-NEXT: movb $48, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm25[0],zmm6[0],zmm25[2],zmm6[2],zmm25[4],zmm6[4],zmm25[6],zmm6[6] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm25, %zmm16 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm19 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm9[0],zmm21[0],zmm9[2],zmm21[2],zmm9[4],zmm21[4],zmm9[6],zmm21[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm10[0],zmm24[0],zmm10[2],zmm24[2],zmm10[4],zmm24[4],zmm10[6],zmm24[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, 
%zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm27[0],zmm22[0],zmm27[2],zmm22[2],zmm27[4],zmm22[4],zmm27[6],zmm22[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm27, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm22, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,11,u,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,11,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm23, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [12,u,u,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm19, %zmm23 ; AVX512DQ-FCP-NEXT: movb $24, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm26 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: movb $14, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm20 {%k4} ; AVX512DQ-FCP-NEXT: movb $120, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm7[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} ; AVX512DQ-FCP-NEXT: movb $-61, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 {%k4} +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm4[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k4} +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: movb $12, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k4} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} ; AVX512DQ-FCP-NEXT: movb $-31, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} ; AVX512DQ-FCP-NEXT: movb $112, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm10 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm17 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: movb $6, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} ; 
AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: movb $56, %sil +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm24 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,15,u,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm16 {%k4} +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: movb $6, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k4} +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} +; AVX512DQ-FCP-NEXT: movb $56, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k4} ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} ; AVX512DQ-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, 
%zmm13 {%k4} ; AVX512DQ-FCP-NEXT: movb $64, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: movb $8, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 192(%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm0[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,12,u,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm15, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1472(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 896(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 1536(%rax) ; AVX512DQ-FCP-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2184, %rsp # imm = 0x888 +; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm29 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-NEXT: movb $96, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,3,11,3,11,3,11,3] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -9773,7 +9788,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,10,0,3,2,10,0,3] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa (%r9), %ymm10 ; AVX512BW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill @@ -9786,14 +9801,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512BW-NEXT: movb $28, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 @@ -9811,12 +9826,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] @@ -9843,13 +9857,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18 -; AVX512BW-NEXT: 
vmovdqa64 128(%rsi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa 128(%r9), %ymm13 @@ -9879,21 +9893,20 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9932,15 +9945,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm20 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm13 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 @@ -9951,43 +9964,43 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm14 {%k3} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm14[0],zmm8[0],zmm14[2],zmm8[2],zmm14[4],zmm8[4],zmm14[6],zmm8[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm9[0],zmm8[0],zmm9[2],zmm8[2],zmm9[4],zmm8[4],zmm9[6],zmm8[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm22, %zmm18, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm18, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm25 = [12,0,0,3,4,5,6,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [12,u,u,3,4,5,6,13] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 ; AVX512BW-NEXT: movb $24, %sil ; AVX512BW-NEXT: kmovd %esi, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,15,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -9997,35 +10010,35 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm27 {%k4} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm22 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %xmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm22 {%k4} +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm8 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k4} ; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k4} -; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm8 +; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm1, %zmm8 ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm27 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm1 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k4} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} @@ -10033,8 +10046,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %esi, %k4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k4} -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm22 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm22 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm21 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm21 = zmm16[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload @@ -10046,7 +10059,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-61, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -10079,9 +10092,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} ; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm21 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm23 {%k4} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k4} @@ -10099,7 +10112,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,11,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm7 @@ -10129,17 +10142,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1472(%rax) @@ -10155,9 +10168,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) @@ -10166,14 +10179,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; 
AVX512BW-NEXT: vmovdqa64 %zmm20, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 1536(%rax) -; AVX512BW-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -10181,435 +10194,437 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: subq $2152, %rsp # imm = 0x868 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-FCP-NEXT: movb $96, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [15,7,15,7,15,7,15,7] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,9,0,3,4,9,0,3] ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm15, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 ; 
AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm4 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa 128(%r8), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 128(%r8), %ymm12 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX512BW-FCP-NEXT: movb $28, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] ; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] -; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm12 -; AVX512BW-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm11 ; AVX512BW-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 
= ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX512BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm12 +; AVX512BW-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm15, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm18 +; 
AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm6[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm5 +; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm24 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm15, %zmm12 ; AVX512BW-FCP-NEXT: movb $48, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12 +; 
AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm12[0],zmm3[0],zmm12[2],zmm3[2],zmm12[4],zmm3[4],zmm12[6],zmm3[6] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] -; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm12, %zmm14 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm14[0],zmm3[0],zmm14[2],zmm3[2],zmm14[4],zmm3[4],zmm14[6],zmm3[6] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm14, %zmm16 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm20 -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm3, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm16[0],zmm29[0],zmm16[2],zmm29[2],zmm16[4],zmm29[4],zmm16[6],zmm29[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm21, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm17[0],zmm28[0],zmm17[2],zmm28[2],zmm17[4],zmm28[4],zmm17[6],zmm28[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm6, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm18[0],zmm28[0],zmm18[2],zmm28[2],zmm18[4],zmm28[4],zmm18[6],zmm28[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, 
%zmm4, %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm21, %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm18 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm6, %zmm31 +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm1 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm27, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm20, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm2, %zmm11 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,11,0,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm24, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,11,u,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm12, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,11,0,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,11,u,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm20, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [12,0,0,3,4,5,6,13] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $24, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,2,3,4,15,0,0] +; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm6, %zmm9 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm9, %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm20 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm16 {%k4} ; AVX512BW-FCP-NEXT: vinserti32x4 $3, (%rax), %zmm9, %zmm5 ; AVX512BW-FCP-NEXT: movb $112, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload +; 
AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm5 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 {%k4} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm6 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm17 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k4} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm30 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm15 {%k4} ; AVX512BW-FCP-NEXT: movb $120, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload -; AVX512BW-FCP-NEXT: # zmm5 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 {%k4} +; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm7 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: # zmm7 = zmm23[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19 {%k4} ; AVX512BW-FCP-NEXT: movb $-61, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, 
%k4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm29[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm29[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k4} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm28[4,5,6,7] ; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k4} ; AVX512BW-FCP-NEXT: movb $6, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} ; AVX512BW-FCP-NEXT: movb $-31, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm31 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k3} ; AVX512BW-FCP-NEXT: movb $56, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = 
mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm26 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k4} ; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm22 {%k3} ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} ; AVX512BW-FCP-NEXT: movb $64, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: movb $8, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,12,0,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [13,0,2,3,4,5,6,14] -; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,5,15,0] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [13,u,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermi2q 
%zmm3, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 1472(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 832(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 640(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 512(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 896(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 896(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1728(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 1600(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1536(%rax) ; AVX512BW-FCP-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -10618,11 +10633,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), 
%zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm7 @@ -10632,20 +10647,20 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-BW-NEXT: movb $96, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] @@ -10663,34 +10678,34 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX512DQ-BW-NEXT: movb $28, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] +; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} @@ -10712,17 +10727,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 @@ -10741,16 +10756,16 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: 
vpermt2q %zmm15, %zmm13, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] @@ -10765,58 +10780,58 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm15 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,12,4,3,0,12,4] ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm29 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: movb $48, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-BW-NEXT: 
vpermt2q %zmm16, %zmm14, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm28 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm26 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm13, %zmm19 ; AVX512DQ-BW-NEXT: vpermi2q %zmm7, %zmm8, %zmm13 ; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm24 @@ -10825,48 +10840,48 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm24 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm4 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm4 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm17[0],zmm0[0],zmm17[2],zmm0[2],zmm17[4],zmm0[4],zmm17[6],zmm0[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm25 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm11, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm10 -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm11, %zmm6 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm11[0],zmm19[0],zmm11[2],zmm19[2],zmm11[4],zmm19[4],zmm11[6],zmm19[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = 
zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm12 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm19 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm20, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm22 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [12,u,u,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm3, %zmm20 ; AVX512DQ-BW-NEXT: movb $24, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm22, %zmm4, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: movb $120, %sil @@ -10904,18 +10919,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} -; AVX512DQ-BW-NEXT: movb $-31, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: movb $112, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 @@ -10924,10 +10931,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), 
%zmm2, %zmm23 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k4} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -10939,13 +10949,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm23 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm2, %zmm19 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm13 {%k3} +; AVX512DQ-BW-NEXT: movb $-31, %sil +; AVX512DQ-BW-NEXT: kmovd %esi, %k3 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: movb $56, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10957,22 +10972,22 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} ; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} ; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k4} ; AVX512DQ-BW-NEXT: movb $64, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm24 {%k4} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm19 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm14 
{%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%r8), %ymm6 @@ -10992,8 +11007,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm22 {%k1} ; AVX512DQ-BW-NEXT: movb $8, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} @@ -11003,17 +11018,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm16, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm8 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1472(%rax) @@ -11025,17 +11040,17 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 896(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 640(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) @@ -11055,427 +11070,425 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $2088, %rsp # imm = 0x828 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [15,7,15,7,15,7,15,7] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] +; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm31, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] +; AVX512DQ-BW-FCP-NEXT: # zmm19 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r8), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm1[0],ymm17[2],ymm1[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm5, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] -; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm0, %ymm5, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm6, %ymm5, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm5, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm5, %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm13, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm30[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm30[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm18[0],zmm7[0],zmm18[2],zmm7[2],zmm18[4],zmm7[4],zmm18[6],zmm7[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm25[0],zmm6[0],zmm25[2],zmm6[2],zmm25[4],zmm6[4],zmm25[6],zmm6[6] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm7, %zmm15 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm31, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm25, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm8[0],zmm26[0],zmm8[2],zmm26[2],zmm8[4],zmm26[4],zmm8[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm25, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm13 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm17, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm9[0],zmm24[0],zmm9[2],zmm24[2],zmm9[4],zmm24[4],zmm9[6],zmm24[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm9[0],zmm21[0],zmm9[2],zmm21[2],zmm9[4],zmm21[4],zmm9[6],zmm21[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm14, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm17, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm10[0],zmm24[0],zmm10[2],zmm24[2],zmm10[4],zmm24[4],zmm10[6],zmm24[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm29, %zmm23, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm13, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm14, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm19, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = 
zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm20, %zmm27, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm19, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,11,0,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm17, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm27[0],zmm22[0],zmm27[2],zmm22[2],zmm27[4],zmm22[4],zmm27[6],zmm22[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm27, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm22, %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm17, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,11,u,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [12,u,u,3,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm19, %zmm23 ; AVX512DQ-BW-FCP-NEXT: movb $24, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm26 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: movb $14, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 
32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm26 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm25 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm20 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $120, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm12[0,1,2,3],zmm22[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm7[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $-61, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm5[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 {%k4} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm4[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm12 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k4} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $-31, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm28 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm27 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $112, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm3, %zmm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm17 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: movb $6, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm24 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm19 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm24 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm26, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: movb $6, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm2, %zmm16 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $56, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte 
Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k4} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k4} ; AVX512DQ-BW-FCP-NEXT: movb $64, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movb $8, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,12,0,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [13,0,2,3,4,5,6,14] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%r8), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm0[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,12,u,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm15, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm3, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: 
vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1472(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1472(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1408(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 1216(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 896(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1216(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1152(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 1024(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 896(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 832(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1728(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1728(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 1536(%rax) ; AVX512DQ-BW-FCP-NEXT: addq $2088, %rsp # imm = 0x828 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -12632,7 +12645,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX-LABEL: store_i64_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $3816, %rsp # imm = 0xEE8 +; AVX-NEXT: subq $3848, %rsp # imm = 0xF08 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 
; AVX-NEXT: vmovaps (%rsi), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12672,30 +12685,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 16(%r8), %xmm1 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 -; AVX-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rax), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 32(%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps 32(%r8), %xmm2 -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 32(%r9), %xmm4 +; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps 32(%r8), %xmm3 +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm4[0] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %ymm1 @@ -12703,18 +12716,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 48(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 32(%r9), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 48(%r8), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 48(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 32(%r9), 
%ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 48(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 48(%rcx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%r9), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12774,19 +12787,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vbroadcastsd 104(%rcx), %ymm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 112(%rdi), %xmm2 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 112(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 112(%rax), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%r9), %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12892,21 +12905,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vbroadcastsd 200(%rcx), %ymm1 +; AVX-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; 
AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rsi), %ymm2 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX-NEXT: vmovaps 208(%rax), %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12989,41 +13002,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 272(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 256(%r9), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 272(%r8), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 272(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 256(%r9), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 272(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 272(%rcx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 272(%rcx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vmovaps 288(%rsi), %xmm2 +; AVX-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, 288(%rcx), %ymm4, %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%rsi), %xmm0 -; AVX-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, 288(%rcx), %ymm2, %ymm3 -; AVX-NEXT: vmovaps 288(%rdx), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] -; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vmovaps 288(%r9), %xmm3 -; AVX-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps 288(%r8), %xmm2 -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovaps 288(%rdx), %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 288(%r9), %xmm4 +; AVX-NEXT: vmovaps %xmm4, (%rsp) 
# 16-byte Spill +; AVX-NEXT: vmovaps 288(%r8), %xmm3 +; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm4[0] ; AVX-NEXT: vmovaps 288(%rax), %xmm5 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX-NEXT: vbroadcastsd 296(%rcx), %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rsi), %ymm1 @@ -13031,18 +13044,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 304(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 288(%r9), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 304(%r8), %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX-NEXT: vmovaps 304(%rax), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%r9), %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 304(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 304(%rcx), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 304(%rcx), %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 320(%r9), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13102,35 +13115,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vbroadcastsd 360(%rcx), %ymm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX-NEXT: vmovaps 352(%rsi), %ymm3 +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5],mem[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rsi), %ymm15 -; AVX-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 368(%rax), %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: vmovaps 368(%rax), %xmm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%r9), %ymm14 +; AVX-NEXT: vmovaps 352(%r9), %ymm15 ; AVX-NEXT: vmovaps 368(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 368(%rcx), %xmm12 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX-NEXT: vmovaps 368(%rcx), %xmm13 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%r9), %xmm13 +; AVX-NEXT: vmovaps 384(%r9), %xmm14 ; AVX-NEXT: vmovaps 384(%r8), %xmm0 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] ; AVX-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX-NEXT: vmovaps 384(%rax), %xmm11 -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX-NEXT: vmovaps 384(%rax), %xmm12 +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rdx), %ymm1 @@ -13143,19 +13157,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%rsi), %ymm10 +; AVX-NEXT: vmovaps 384(%rsi), %ymm11 ; AVX-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vmovaps 400(%rax), %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%r9), %ymm9 +; AVX-NEXT: vmovaps 384(%r9), %ymm10 ; AVX-NEXT: vmovaps 400(%r8), %xmm0 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX-NEXT: vmovaps 400(%rcx), %xmm8 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] +; AVX-NEXT: vmovaps 400(%rcx), %xmm9 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rsi), %xmm5 @@ -13167,11 +13181,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[2] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX-NEXT: vmovapd 416(%r9), %xmm7 +; AVX-NEXT: vmovapd 416(%r9), %xmm8 ; AVX-NEXT: vmovapd 416(%r8), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm7[0] -; AVX-NEXT: vmovapd 416(%rax), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm8[0] +; AVX-NEXT: vmovapd 416(%rax), %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm2[1] @@ -13184,24 +13198,24 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX-NEXT: vmovapd 448(%rdx), %ymm5 -; AVX-NEXT: vinsertf128 $1, 448(%r8), %ymm5, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vinsertf128 $1, 448(%r8), %ymm5, %ymm3 ; AVX-NEXT: vmovaps 480(%rsi), %xmm2 ; AVX-NEXT: vmovaps 480(%rdi), %xmm6 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0] -; AVX-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm3 -; AVX-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-NEXT: vbroadcastsd 488(%rcx), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, 480(%rcx), %ymm0, %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 480(%rdx), %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX-NEXT: vbroadcastsd 488(%rcx), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vinsertf128 $1, 480(%r8), %ymm3, %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -13380,177 +13394,178 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload +; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%r8), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX-NEXT: vmovaps 368(%rdx), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm11[1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm12[1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%r8), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm4[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm7[1] ; AVX-NEXT: vmovapd 416(%rdi), %ymm1 ; AVX-NEXT: vmovapd 416(%rsi), %ymm2 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 416(%r8), %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX-NEXT: vmovapd 432(%rcx), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX-NEXT: vmovapd 432(%rcx), %xmm0 ; AVX-NEXT: vmovapd 432(%rdx), %xmm3 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 432(%rdi), %xmm0 -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm2[0,0,3,2] -; AVX-NEXT: vmovapd 416(%rax), %ymm2 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm3[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] -; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 432(%rdi), %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],mem[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX-NEXT: vmovapd 416(%rax), %ymm1 +; 
AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vmovaps 448(%rdi), %xmm13 +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX-NEXT: vmovapd 448(%r8), %ymm3 -; AVX-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm7 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 448(%rdi), %ymm0 -; AVX-NEXT: vmovapd 448(%rsi), %ymm7 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; AVX-NEXT: vmovapd 464(%rdi), %xmm8 -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,2] -; AVX-NEXT: vmovapd 448(%rax), %ymm10 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] -; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 -; AVX-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] -; AVX-NEXT: vmovapd 480(%r8), %ymm13 -; AVX-NEXT: vinsertf128 $1, 480(%rax), %ymm13, %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 480(%rdi), %ymm5 -; AVX-NEXT: vmovapd 480(%rsi), %ymm6 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 448(%rdi), %ymm2 +; AVX-NEXT: vmovapd 448(%rsi), %ymm4 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX-NEXT: vmovapd 496(%rdi), %xmm7 -; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,2] -; AVX-NEXT: vmovapd 480(%rax), %ymm15 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] -; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] -; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovapd 464(%rcx), %xmm1 -; AVX-NEXT: vmovapd 464(%rdx), %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX-NEXT: vbroadcastsd 464(%r9), %ymm3 -; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3] +; AVX-NEXT: vmovapd 464(%rdi), %xmm2 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] +; AVX-NEXT: vmovapd 448(%rax), %ymm10 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm10[2,3],ymm4[2,3] +; 
AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3] ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX-NEXT: vbroadcastsd 472(%r9), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX-NEXT: vmovapd 480(%r8), %ymm6 +; AVX-NEXT: vinsertf128 $1, 480(%rax), %ymm6, %ymm4 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 480(%rdi), %ymm2 +; AVX-NEXT: vmovapd 480(%rsi), %ymm5 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX-NEXT: vmovapd 496(%rdi), %xmm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],mem[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3] +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,2] +; AVX-NEXT: vmovapd 480(%rax), %ymm2 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm5[2,3] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX-NEXT: vbroadcastsd 440(%r9), %ymm11 +; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2],ymm8[3] +; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] +; AVX-NEXT: vmovapd 464(%rcx), %xmm0 +; AVX-NEXT: vmovapd 464(%rdx), %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX-NEXT: vbroadcastsd 464(%r9), %ymm5 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX-NEXT: vbroadcastsd 472(%r9), %ymm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3] +; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovapd 496(%rcx), %xmm0 ; AVX-NEXT: vmovapd 496(%rdx), %xmm1 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3] -; AVX-NEXT: vbroadcastsd 496(%r9), %ymm2 -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX-NEXT: vbroadcastsd 496(%r9), %ymm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX-NEXT: vbroadcastsd 504(%r9), %ymm1 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm2[1],ymm4[2,3] ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX-NEXT: # xmm9 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 256(%rdx), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX-NEXT: # xmm6 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm0[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm4[0],mem[0] +; AVX-NEXT: vmovaps 192(%rdx), %xmm11 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX-NEXT: vmovaps 320(%rdx), %xmm14 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 448(%rdx), %xmm10 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 384(%rdx), %xmm15 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps 256(%rdx), %xmm14 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX-NEXT: vmovaps 128(%rdx), %xmm10 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX-NEXT: vmovaps (%rdx), %xmm12 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX-NEXT: vmovaps 192(%rdx), %xmm13 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX-NEXT: vmovaps 320(%rdx), %xmm15 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX-NEXT: vmovaps 448(%rdx), %xmm11 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX-NEXT: vmovaps 384(%rdx), %xmm9 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX-NEXT: vmovaps (%rdx), %xmm8 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX-NEXT: vmovaps %xmm8, 16(%rax) +; 
AVX-NEXT: vmovaps %xmm12, 16(%rax) ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vmovaps %xmm9, 2704(%rax) +; AVX-NEXT: vmovaps %xmm15, 2704(%rax) ; AVX-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX-NEXT: vmovaps %xmm11, 3152(%rax) -; AVX-NEXT: vmovaps %xmm4, 3136(%rax) -; AVX-NEXT: vmovaps %xmm15, 2256(%rax) +; AVX-NEXT: vmovaps %xmm10, 3152(%rax) +; AVX-NEXT: vmovaps %xmm13, 3136(%rax) +; AVX-NEXT: vmovaps %xmm14, 2256(%rax) ; AVX-NEXT: vmovaps %xmm2, 2240(%rax) -; AVX-NEXT: vmovaps %xmm13, 1360(%rax) -; AVX-NEXT: vmovaps %xmm3, 1344(%rax) -; AVX-NEXT: vmovaps %xmm12, 464(%rax) -; AVX-NEXT: vmovaps %xmm5, 448(%rax) -; AVX-NEXT: vmovaps %xmm10, 912(%rax) -; AVX-NEXT: vmovaps %xmm6, 896(%rax) -; AVX-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX-NEXT: vmovaps %xmm7, 1792(%rax) +; AVX-NEXT: vmovaps %xmm11, 1360(%rax) +; AVX-NEXT: vmovaps %xmm4, 1344(%rax) +; AVX-NEXT: vmovaps %xmm3, 464(%rax) +; AVX-NEXT: vmovaps %xmm6, 448(%rax) +; AVX-NEXT: vmovaps %xmm5, 912(%rax) +; AVX-NEXT: vmovaps %xmm8, 896(%rax) +; AVX-NEXT: vmovaps %xmm7, 1808(%rax) +; AVX-NEXT: vmovaps %xmm9, 1792(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -13759,47 +13774,47 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 64(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rax) -; AVX-NEXT: addq $3816, %rsp # imm = 0xEE8 +; AVX-NEXT: addq $3848, %rsp # imm = 0xF08 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i64_stride7_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $3880, %rsp # imm = 0xF28 +; AVX2-NEXT: subq $3896, %rsp # imm = 0xF38 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-NEXT: vmovaps (%r8), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps (%r9), %ymm6 -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%r8), %ymm7 +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%r9), %ymm8 +; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX2-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm6 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-NEXT: vbroadcastsd 8(%rcx), %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps (%r8), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; 
AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vmovaps (%rax), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rax), %xmm5 +; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-NEXT: vmovaps 16(%rdx), %xmm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -13811,7 +13826,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%r8), %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -13865,9 +13880,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vmovaps 64(%rax), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm3 @@ -13945,9 +13960,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vmovaps 128(%rax), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm3 @@ -14020,55 +14035,55 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovaps 192(%r8), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovaps 208(%rdx), %xmm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX2-NEXT: vmovaps 192(%r8), %xmm3 +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; 
AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 192(%rax), %xmm3 +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-NEXT: vmovaps 192(%r8), %ymm5 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %ymm4 -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 208(%rax), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-NEXT: vmovaps 208(%rax), %xmm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-NEXT: vmovaps 224(%rax), %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps 224(%rax), %xmm0 +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm2 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vmovaps 224(%rdx), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -14086,15 +14101,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 240(%rax), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 256(%rsi), %xmm1 +; AVX2-NEXT: vinsertf128 $1, 256(%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vmovaps 240(%rax), %xmm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX2-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdx), %ymm0 ; AVX2-NEXT: vbroadcastsd 264(%rcx), %ymm1 @@ -14102,7 +14117,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 256(%r8), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14117,11 +14132,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 272(%rdx), %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vmovaps 256(%r8), %ymm5 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 256(%r8), %ymm6 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -14139,11 +14154,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 288(%rdx), %xmm3 -; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 288(%rdx), %xmm4 +; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-NEXT: vbroadcastsd 296(%rcx), %ymm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -14156,56 +14171,56 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-NEXT: vmovaps 288(%r8), %ymm4 +; AVX2-NEXT: vmovaps 288(%r8), %ymm6 +; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 288(%r9), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 288(%r9), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 304(%rax), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX2-NEXT: vinsertf128 $1, 320(%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vmovaps 304(%rax), %xmm2 +; AVX2-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX2-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX2-NEXT: vbroadcastsd 328(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 328(%rcx), %ymm0 +; AVX2-NEXT: vmovaps 320(%rdx), %ymm1 +; 
AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 320(%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%rax), %xmm10 -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 320(%rax), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-NEXT: vmovaps 320(%r9), %ymm4 -; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovaps 320(%r9), %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovaps 336(%rax), %xmm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14218,9 +14233,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 352(%rdx), %xmm7 +; AVX2-NEXT: vmovaps 352(%rdx), %xmm8 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; AVX2-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -14234,10 +14249,10 @@ define void @store_i64_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-NEXT: vmovaps 352(%r8), %ymm7 ; AVX2-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14255,19 +14270,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 384(%rdx), %ymm0 -; AVX2-NEXT: vbroadcastsd 392(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vbroadcastsd 392(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps 384(%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 384(%rax), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 384(%rax), %xmm6 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX2-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] @@ -14275,10 +14289,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 400(%rdx), %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-NEXT: vmovaps 384(%r8), %ymm5 ; AVX2-NEXT: vmovaps 384(%r9), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -14291,13 +14305,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdx), %xmm14 +; AVX2-NEXT: vmovaps 416(%rdx), %xmm15 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -14329,64 +14344,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 448(%r8), %ymm11 +; AVX2-NEXT: vmovaps 448(%r8), %ymm13 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[0,1],ymm0[0,1] ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-NEXT: vmovaps 448(%rdx), %ymm14 ; AVX2-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX2-NEXT: vmovaps 464(%rax), %xmm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 480(%r8), %ymm12 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vbroadcastsd 480(%rax), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-NEXT: vmovaps 480(%r8), %ymm2 +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm0[0,1] +; 
AVX2-NEXT: vbroadcastsd 480(%rax), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 480(%rdx), %xmm8 ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; AVX2-NEXT: vbroadcastsd 488(%rcx), %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovaps 480(%rdx), %xmm12 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX2-NEXT: vbroadcastsd 488(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rsi), %ymm1 -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vmovaps 480(%rdx), %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovaps 496(%rax), %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps 480(%rdx), %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vmovaps 496(%rax), %xmm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14394,41 +14410,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastsd 56(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastsd 88(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vbroadcastsd 96(%rcx), %ymm1 +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14456,30 +14472,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 160(%rcx), %ymm2 +; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 216(%rcx), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 216(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -14492,12 +14508,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14506,117 +14522,118 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 280(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vbroadcastsd 280(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd 288(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vbroadcastsd 288(%rcx), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vbroadcastsd 312(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; 
AVX2-NEXT: vbroadcastsd %xmm10, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-NEXT: vbroadcastsd 344(%rcx), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vbroadcastsd 408(%rcx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-NEXT: vbroadcastsd 416(%rcx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vbroadcastsd 408(%rcx), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 ; AVX2-NEXT: vbroadcastsd 440(%rcx), %ymm15 -; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-NEXT: 
vbroadcastsd 440(%r9), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] -; AVX2-NEXT: vbroadcastsd 464(%r9), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vbroadcastsd 472(%rcx), %ymm11 -; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 472(%r9), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovaps 448(%rax), %ymm0 -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovaps 480(%rdi), %xmm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-NEXT: vbroadcastsd 480(%rcx), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] -; AVX2-NEXT: vbroadcastsd 496(%r9), %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 416(%rcx), %ymm15 +; AVX2-NEXT: vbroadcastsd 440(%r9), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX2-NEXT: vbroadcastsd 464(%r9), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vbroadcastsd 472(%rcx), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-NEXT: vbroadcastsd 472(%r9), %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovaps 448(%rax), %ymm2 +; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-NEXT: # ymm13 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovaps 480(%rdi), %xmm14 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm2 +; AVX2-NEXT: vbroadcastsd 480(%rcx), %ymm12 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = ymm10[2,3],mem[2,3] ; AVX2-NEXT: vbroadcastsd 504(%rcx), %ymm12 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-NEXT: vbroadcastsd 504(%r9), %ymm13 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovaps 480(%rax), %ymm13 -; AVX2-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vbroadcastsd 496(%r9), %ymm14 +; AVX2-NEXT: vbroadcastsd 504(%r9), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 480(%rax), %ymm12 +; AVX2-NEXT: 
vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-NEXT: # ymm14 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovaps %ymm12, 3552(%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm12, 3520(%rcx) -; AVX2-NEXT: vmovaps %ymm6, 3488(%rcx) -; AVX2-NEXT: vmovaps %ymm0, 3456(%rcx) +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovaps %ymm0, 3552(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 3520(%rcx) +; AVX2-NEXT: vmovaps %ymm10, 3488(%rcx) +; AVX2-NEXT: vmovaps %ymm14, 3456(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3424(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3392(%rcx) -; AVX2-NEXT: vmovaps %ymm8, 3360(%rcx) +; AVX2-NEXT: vmovaps %ymm2, 3360(%rcx) ; AVX2-NEXT: vmovaps %ymm1, 3328(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3296(%rcx) -; AVX2-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-NEXT: vmovaps %ymm11, 3232(%rcx) +; AVX2-NEXT: vmovaps %ymm4, 3264(%rcx) +; AVX2-NEXT: vmovaps %ymm13, 3232(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3168(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-NEXT: vmovaps %ymm13, 3104(%rcx) +; AVX2-NEXT: vmovaps %ymm3, 3104(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14627,8 +14644,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-NEXT: vmovaps %ymm14, 2912(%rcx) -; AVX2-NEXT: vmovaps %ymm3, 2880(%rcx) +; AVX2-NEXT: vmovaps %ymm15, 2912(%rcx) +; AVX2-NEXT: vmovaps %ymm5, 2880(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14637,10 +14654,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-NEXT: vmovaps %ymm4, 2720(%rcx) +; AVX2-NEXT: vmovaps %ymm6, 2720(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-NEXT: vmovaps %ymm5, 2656(%rcx) +; AVX2-NEXT: vmovaps %ymm7, 2656(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14651,7 +14668,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-NEXT: vmovaps %ymm7, 2464(%rcx) +; AVX2-NEXT: vmovaps %ymm8, 2464(%rcx) ; AVX2-NEXT: vmovaps %ymm9, 2432(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2400(%rcx) @@ -14661,7 +14678,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-NEXT: vmovaps %ymm10, 2272(%rcx) +; AVX2-NEXT: vmovaps %ymm11, 2272(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -14804,47 +14821,47 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-NEXT: addq $3880, %rsp # imm = 0xF28 +; AVX2-NEXT: addq $3896, %rsp # imm = 0xF38 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i64_stride7_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $3880, %rsp # imm = 0xF28 +; AVX2-FP-NEXT: subq $3896, %rsp # imm = 0xF38 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovaps (%r8), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps (%r9), %ymm6 -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%r8), %ymm7 +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%r9), %ymm8 +; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-FP-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm6 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FP-NEXT: vbroadcastsd 8(%rcx), %ymm4 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vmovaps (%rax), %xmm8 -; AVX2-FP-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rax), %xmm5 +; AVX2-FP-NEXT: vmovaps 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FP-NEXT: vmovaps 16(%rdx), %xmm4 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -14856,7 +14873,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -14910,9 +14927,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm3 @@ -14990,9 +15007,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm3 @@ -15065,55 +15082,55 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-FP-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-FP-NEXT: vmovaps 192(%r8), %ymm5 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 208(%rax), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-FP-NEXT: vmovaps 208(%rax), %xmm2 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm2 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm0 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vmovaps 224(%rdx), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FP-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 
%xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -15131,15 +15148,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 240(%rax), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 256(%rsi), %xmm1 +; AVX2-FP-NEXT: vinsertf128 $1, 256(%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovaps 240(%rax), %xmm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX2-FP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%rdx), %ymm0 ; AVX2-FP-NEXT: vbroadcastsd 264(%rcx), %ymm1 @@ -15147,7 +15164,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 256(%r8), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15162,11 +15179,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 272(%rdx), %xmm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm5 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 256(%r8), %ymm6 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FP-NEXT: 
vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -15184,11 +15201,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 288(%rdx), %xmm3 -; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 288(%rdx), %xmm4 +; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-FP-NEXT: vbroadcastsd 296(%rcx), %ymm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -15201,56 +15218,56 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps 288(%r8), %ymm4 +; AVX2-FP-NEXT: vmovaps 288(%r8), %ymm6 +; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 288(%r9), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 288(%r9), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 304(%rax), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX2-FP-NEXT: vinsertf128 $1, 320(%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovaps 304(%rax), %xmm2 +; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm3 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX2-FP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 328(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 328(%rcx), %ymm0 +; AVX2-FP-NEXT: vmovaps 320(%rdx), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 320(%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%rax), %xmm10 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%rax), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovaps 320(%r9), %ymm11 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovaps 336(%rax), %xmm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15263,9 +15280,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 352(%rdx), %xmm7 +; AVX2-FP-NEXT: vmovaps 352(%rdx), %xmm8 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; AVX2-FP-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -15279,10 +15296,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-FP-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-FP-NEXT: vmovaps 352(%r8), %ymm7 ; AVX2-FP-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15300,19 +15317,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 384(%rdx), %ymm0 -; AVX2-FP-NEXT: vbroadcastsd 392(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FP-NEXT: vbroadcastsd 392(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps 384(%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 384(%rax), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 384(%rax), %xmm6 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] @@ -15320,10 +15336,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 400(%rdx), %xmm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-FP-NEXT: vmovaps 384(%r8), %ymm5 ; AVX2-FP-NEXT: vmovaps 384(%r9), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -15336,13 +15352,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%rdx), %xmm14 +; AVX2-FP-NEXT: vmovaps 416(%rdx), %xmm15 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-FP-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -15374,64 +15391,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 448(%r8), %ymm11 +; AVX2-FP-NEXT: vmovaps 448(%r8), %ymm13 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[0,1],ymm0[0,1] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-FP-NEXT: vmovaps 448(%rdx), %ymm14 ; AVX2-FP-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] 
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FP-NEXT: vmovaps 464(%rax), %xmm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 480(%r8), %ymm12 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vbroadcastsd 480(%rax), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovaps 480(%r8), %ymm2 +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm0[0,1] +; AVX2-FP-NEXT: vbroadcastsd 480(%rax), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 480(%rdx), %xmm8 ; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; AVX2-FP-NEXT: vbroadcastsd 488(%rcx), %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FP-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovaps 480(%rdx), %xmm12 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX2-FP-NEXT: vbroadcastsd 488(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rsi), %ymm1 -; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vmovaps 480(%rdx), %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovaps 496(%rax), %xmm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 480(%rdx), %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vmovaps 496(%rax), %xmm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded 
Reload +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15439,41 +15457,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 56(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 88(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vbroadcastsd 96(%rcx), %ymm1 +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15501,30 +15519,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 160(%rcx), %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 216(%rcx), %ymm4 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 216(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -15537,12 +15555,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15551,117 +15569,118 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 280(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd 280(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd 288(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vbroadcastsd 288(%rcx), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 312(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd %xmm10, %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vbroadcastsd 344(%rcx), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-FP-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; 
AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vbroadcastsd 408(%rcx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd 416(%rcx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vbroadcastsd 408(%rcx), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 ; AVX2-FP-NEXT: vbroadcastsd 440(%rcx), %ymm15 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 440(%r9), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] -; AVX2-FP-NEXT: vbroadcastsd 464(%r9), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vbroadcastsd 472(%rcx), %ymm11 -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 472(%r9), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 448(%rax), %ymm0 -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm13 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-FP-NEXT: vbroadcastsd 480(%rcx), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] -; AVX2-FP-NEXT: vbroadcastsd 496(%r9), %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 416(%rcx), %ymm15 +; AVX2-FP-NEXT: vbroadcastsd 440(%r9), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX2-FP-NEXT: 
vbroadcastsd 464(%r9), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vbroadcastsd 472(%rcx), %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 472(%r9), %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 448(%rax), %ymm2 +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm14 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm2 +; AVX2-FP-NEXT: vbroadcastsd 480(%rcx), %ymm12 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-FP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm10 = ymm10[2,3],mem[2,3] ; AVX2-FP-NEXT: vbroadcastsd 504(%rcx), %ymm12 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 504(%r9), %ymm13 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 480(%rax), %ymm13 -; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vbroadcastsd 496(%r9), %ymm14 +; AVX2-FP-NEXT: vbroadcastsd 504(%r9), %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 480(%rax), %ymm12 +; AVX2-FP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm14 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovaps %ymm12, 3552(%rcx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm12, 3520(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm6, 3488(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm0, 3456(%rcx) +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovaps %ymm0, 3552(%rcx) +; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vmovaps %ymm0, 3520(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm10, 3488(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm14, 3456(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3424(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3392(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm8, 3360(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm2, 3360(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm1, 3328(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3296(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm11, 3232(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm4, 3264(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm13, 3232(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-FP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3168(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm13, 3104(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm3, 3104(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15672,8 +15691,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm14, 2912(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm3, 2880(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm15, 2912(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm5, 2880(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15682,10 +15701,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm4, 2720(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm6, 2720(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm5, 2656(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm7, 2656(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15696,7 +15715,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm7, 2464(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm8, 2464(%rcx) ; AVX2-FP-NEXT: vmovaps %ymm9, 2432(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2400(%rcx) @@ -15706,7 +15725,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-FP-NEXT: vmovaps %ymm10, 2272(%rcx) +; AVX2-FP-NEXT: vmovaps %ymm11, 2272(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -15849,47 +15868,47 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FP-NEXT: addq $3880, %rsp # imm = 0xF28 +; AVX2-FP-NEXT: addq $3896, %rsp # imm = 0xF38 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i64_stride7_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $3880, %rsp # imm = 0xF28 +; AVX2-FCP-NEXT: subq $3896, %rsp # imm = 0xF38 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; 
AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm6 -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm7 +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm8 +; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-FCP-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm6 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FCP-NEXT: vbroadcastsd 8(%rcx), %ymm4 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vmovaps (%rax), %xmm8 -; AVX2-FCP-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rax), %xmm5 +; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FCP-NEXT: vmovaps 16(%rdx), %xmm4 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -15901,7 +15920,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -15955,9 +15974,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm3 @@ -16035,9 +16054,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm3 @@ -16110,55 +16129,55 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm2 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FCP-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 208(%rax), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovaps 208(%rax), %xmm2 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm2 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm0 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FCP-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -16176,15 +16195,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 240(%rax), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; 
AVX2-FCP-NEXT: vmovaps 256(%rsi), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rcx), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovaps 240(%rax), %xmm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rcx), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%rdx), %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 264(%rcx), %ymm1 @@ -16192,7 +16211,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 256(%r8), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16207,11 +16226,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 272(%rdx), %xmm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm5 -; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 256(%r8), %ymm6 +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%r9), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -16229,11 +16248,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 288(%rdx), %xmm3 -; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 288(%rdx), %xmm4 +; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-FCP-NEXT: vbroadcastsd 296(%rcx), %ymm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -16246,56 +16265,56 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovaps 288(%r8), %ymm4 +; AVX2-FCP-NEXT: vmovaps 288(%r8), %ymm6 +; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 288(%r9), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 288(%r9), %ymm3 -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 304(%rax), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rcx), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovaps 304(%rax), %xmm2 +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm3 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rcx), %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 328(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 328(%rcx), %ymm0 +; AVX2-FCP-NEXT: vmovaps 320(%rdx), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 320(%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%rax), %xmm10 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 320(%rax), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 320(%r8), %ymm9 -; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovaps 320(%r9), %ymm11 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovaps 336(%rax), %xmm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16308,9 +16327,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 352(%rdx), %xmm7 +; AVX2-FCP-NEXT: vmovaps 352(%rdx), %xmm8 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; AVX2-FCP-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -16324,10 +16343,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-FCP-NEXT: vmovaps 352(%r8), %ymm7 ; AVX2-FCP-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16345,19 +16364,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 384(%rdx), %ymm0 -; AVX2-FCP-NEXT: vbroadcastsd 392(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-FCP-NEXT: vbroadcastsd 392(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 384(%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 384(%rax), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 384(%rax), %xmm6 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] @@ -16365,10 +16383,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 400(%rdx), %xmm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-FCP-NEXT: vmovaps 384(%r8), %ymm5 ; AVX2-FCP-NEXT: vmovaps 384(%r9), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm15[0],ymm4[0],ymm15[2],ymm4[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -16381,13 +16399,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%rdx), %xmm14 +; AVX2-FCP-NEXT: vmovaps 416(%rdx), %xmm15 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; 
AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] ; AVX2-FCP-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -16419,64 +16438,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps 448(%r8), %ymm11 +; AVX2-FCP-NEXT: vmovaps 448(%r8), %ymm13 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[0,1],ymm0[0,1] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-FCP-NEXT: vmovaps 448(%rdx), %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 448(%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FCP-NEXT: vmovaps 464(%rax), %xmm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovaps 480(%r8), %ymm12 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vbroadcastsd 480(%rax), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovaps 480(%r8), %ymm2 +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm0[0,1] +; AVX2-FCP-NEXT: vbroadcastsd 480(%rax), %ymm4 +; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 480(%rdx), %xmm8 ; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; AVX2-FCP-NEXT: vbroadcastsd 488(%rcx), %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rdx), %xmm12 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX2-FCP-NEXT: vbroadcastsd 488(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %ymm1 -; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-FCP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vmovaps 480(%rdx), %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovaps 496(%rax), %xmm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rdx), %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 496(%rax), %xmm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16484,41 +16504,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd 
{{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 56(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 88(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vbroadcastsd 96(%rcx), %ymm1 +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16546,30 +16566,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 160(%rcx), %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 216(%rcx), %ymm4 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 216(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -16582,12 +16602,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -16596,117 +16616,118 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 280(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd 280(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd 288(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 288(%rcx), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 312(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd %xmm10, %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 344(%rcx), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm0 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded 
Reload +; AVX2-FCP-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 408(%rcx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd 416(%rcx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 408(%rcx), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vbroadcastsd 440(%rcx), %ymm15 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 440(%r9), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 464(%r9), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vbroadcastsd 472(%rcx), %ymm11 -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 472(%r9), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 448(%rax), %ymm0 -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm13 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-FCP-NEXT: vbroadcastsd 480(%rcx), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 496(%r9), %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 416(%rcx), %ymm15 +; AVX2-FCP-NEXT: vbroadcastsd 440(%r9), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm13[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 464(%r9), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 472(%rcx), %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 472(%r9), %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 448(%rax), %ymm2 +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm14 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm2 +; AVX2-FCP-NEXT: vbroadcastsd 480(%rcx), %ymm12 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-FCP-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm10 = ymm10[2,3],mem[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 504(%rcx), %ymm12 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 504(%r9), %ymm13 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rax), %ymm13 -; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vbroadcastsd 496(%r9), %ymm14 +; AVX2-FCP-NEXT: vbroadcastsd 504(%r9), %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 480(%rax), %ymm12 +; AVX2-FCP-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm14 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovaps %ymm12, 3552(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm12, 3520(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm6, 3488(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm0, 3456(%rcx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovaps %ymm0, 3552(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm0, 3520(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm10, 3488(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm14, 3456(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3424(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3392(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm8, 3360(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm2, 3360(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm1, 3328(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3296(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm11, 3232(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm4, 3264(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm13, 3232(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3168(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm13, 3104(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm3, 3104(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16717,8 +16738,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FCP-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm14, 2912(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm3, 2880(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm15, 2912(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm5, 2880(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16727,10 +16748,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm4, 2720(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm6, 2720(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm5, 2656(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm7, 2656(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16741,7 +16762,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm7, 2464(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm8, 2464(%rcx) ; AVX2-FCP-NEXT: vmovaps %ymm9, 2432(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2400(%rcx) @@ -16751,7 +16772,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-FCP-NEXT: vmovaps %ymm10, 2272(%rcx) +; AVX2-FCP-NEXT: vmovaps %ymm11, 2272(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -16894,313 +16915,317 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FCP-NEXT: addq $3880, %rsp # imm = 0xF28 +; AVX2-FCP-NEXT: addq $3896, %rsp # imm = 0xF38 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i64_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $6248, %rsp # imm = 0x1868 +; AVX512-NEXT: subq $6312, %rsp # imm = 0x18A8 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 
64(%rdx), %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [2,10,0,3,2,10,0,3] +; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $96, %r10b ; AVX512-NEXT: kmovw %r10d, %k1 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 +; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512-NEXT: vmovdqa (%r9), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX512-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512-NEXT: movb $28, %r10b ; AVX512-NEXT: kmovw %r10d, %k2 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512-NEXT: vpermt2q %zmm16, %zmm4, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,0,14,6,5,0,14,6] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [6,13,14,7,6,13,14,7] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = 
ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 ; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm27 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512-NEXT: vmovdqa 128(%r9), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, 
%zmm24, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm21 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512-NEXT: vmovdqa 192(%r9), %ymm5 -; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 192(%r9), %ymm7 +; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 -; 
AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512-NEXT: vpermt2q %zmm8, %zmm3, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512-NEXT: vmovdqa64 256(%rsi), %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 -; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm12 -; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 +; AVX512-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 256(%r9), %ymm14 +; AVX512-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 256(%r8), %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 256(%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm14 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 -; AVX512-NEXT: 
vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 -; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4 ; AVX512-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%rsi), %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 -; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm12 +; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512-NEXT: vmovdqa 320(%r9), %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 320(%r9), %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 -; 
AVX512-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 -; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm24 +; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm11, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 384(%rsi), %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm9 ; AVX512-NEXT: movb $24, %r10b ; AVX512-NEXT: kmovw %r10d, %k3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 +; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm24 +; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] -; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u] +; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512-NEXT: vmovdqu64 %zmm28, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -17221,7 +17246,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 @@ -17234,76 +17259,75 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-NEXT: vpermt2q %zmm27, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 +; AVX512-NEXT: vpermt2q %zmm16, %zmm2, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm2, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm12 ; AVX512-NEXT: vmovdqa64 448(%rdx), %zmm8 ; AVX512-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 -; AVX512-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 +; AVX512-NEXT: vpermi2q %zmm11, %zmm5, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 ; AVX512-NEXT: movb $48, %r10b ; AVX512-NEXT: kmovw %r10d, %k4 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 @@ -17312,7 +17336,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 @@ -17321,7 +17345,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17339,106 +17364,104 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 +; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 -; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 -; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = 
zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512-NEXT: vpermt2q %zmm26, %zmm4, %zmm31 +; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 +; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm25, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm27 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm26 +; AVX512-NEXT: vpermt2q %zmm19, %zmm7, %zmm31 +; AVX512-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 +; AVX512-NEXT: vpermt2q %zmm19, %zmm25, %zmm24 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512-NEXT: vpermt2q %zmm20, %zmm5, %zmm22 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 ; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 ; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-NEXT: vpermt2q %zmm28, %zmm5, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 +; AVX512-NEXT: vpermi2q %zmm12, %zmm28, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 +; AVX512-NEXT: vpermi2q %zmm12, %zmm28, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] +; AVX512-NEXT: vpermi2q %zmm12, %zmm28, %zmm10 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm12[0],zmm28[2],zmm12[2],zmm28[4],zmm12[4],zmm28[6],zmm12[6] ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 +; AVX512-NEXT: vpermt2q %zmm12, %zmm4, %zmm28 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 ; AVX512-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512-NEXT: movb $6, %sil ; AVX512-NEXT: kmovw %esi, %k4 ; AVX512-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7] ; AVX512-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 ; AVX512-NEXT: movb $64, %sil ; AVX512-NEXT: kmovw %esi, %k5 @@ -17447,16 +17470,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: kmovw %esi, %k5 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} ; AVX512-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[u,1,2,3,4,5,15,u] ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512-NEXT: movb $12, %sil ; AVX512-NEXT: kmovw %esi, %k5 @@ -17465,15 +17488,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] ; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 ; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa 384(%r9), %ymm12 ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -17481,160 +17504,159 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $8, %sil ; AVX512-NEXT: kmovw %esi, %k2 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k3} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} ; AVX512-NEXT: movb $-31, %sil ; AVX512-NEXT: kmovw %esi, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} ; AVX512-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} ; AVX512-NEXT: vmovdqa 256(%rdx), %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k5} ; AVX512-NEXT: vmovdqa 320(%rdx), %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k5} ; AVX512-NEXT: movb $112, %sil ; AVX512-NEXT: kmovw %esi, %k2 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; 
AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti32x4 $3, 384(%rax), %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} ; AVX512-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} ; AVX512-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, 
%zmm30 {%k4} -; AVX512-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} +; AVX512-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} ; AVX512-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k4} ; AVX512-NEXT: movb $56, %cl ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17642,10 +17664,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -17653,50 +17681,45 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512-NEXT: movb $120, %cl ; AVX512-NEXT: kmovw %ecx, %k1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm11 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512-NEXT: movb $-61, %cl ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -17714,86 +17737,87 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm10, %zmm7 # 64-byte Folded Reload ; AVX512-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: movb $14, %cl ; AVX512-NEXT: kmovw %ecx, %k1 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm18 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm31 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] 
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 2944(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 2880(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm7, 2816(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 2752(%rax) ; AVX512-NEXT: vmovdqa64 %zmm28, 2688(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 2624(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 2624(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 2560(%rax) ; AVX512-NEXT: vmovdqa64 %zmm23, 2496(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 2432(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 2240(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 2304(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 2176(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 2112(%rax) ; AVX512-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 1856(%rax) +; AVX512-NEXT: vmovdqa64 %zmm26, 1792(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 1728(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512-NEXT: vmovdqa64 %zmm31, 1536(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17802,9 +17826,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 1152(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 1088(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17813,8 +17837,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 704(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 640(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 576(%rax) @@ -17822,10 +17846,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 128(%rax) @@ -17848,334 +17872,328 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512-NEXT: addq $6248, %rsp # imm = 0x1868 +; AVX512-NEXT: addq $6312, %rsp # imm = 0x18A8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i64_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 +; AVX512-FCP-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: movb $96, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k1 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm8 ; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512-FCP-NEXT: 
vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512-FCP-NEXT: movb $28, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,12,0,5,4,12,0,5] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] +; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] -; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,13,6,7,0,13,6,7] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [6,13,14,7,6,13,14,7] +; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpermt2q 
%zmm30, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 -; AVX512-FCP-NEXT: vmovdqa 192(%r9), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm25[0],ymm28[0],ymm25[2],ymm28[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm13[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 ; AVX512-FCP-NEXT: vmovdqa 256(%r9), %ymm10 ; AVX512-FCP-NEXT: vmovdqa 256(%r8), %ymm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm14, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %ymm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm18[0],ymm6[0],ymm18[2],ymm6[2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 320(%r9), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm28, %ymm0, %ymm25 +; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 -; AVX512-FCP-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm18 +; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 384(%r9), %ymm2 ; AVX512-FCP-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 -; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: 
vmovdqa64 384(%rdi), %zmm28 +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%rsi), %zmm18 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm28 ; AVX512-FCP-NEXT: movb $24, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermt2q (%rsp), %zmm20, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdx), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] @@ -18184,216 +18202,219 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, 
%zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $48, %r10b ; AVX512-FCP-NEXT: kmovw %r10d, %k4 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm12[0],zmm19[0],zmm12[2],zmm19[2],zmm12[4],zmm19[4],zmm12[6],zmm19[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 
%zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm9 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm24 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k4} = 
zmm3[0],zmm31[0],zmm3[2],zmm31[2],zmm3[4],zmm31[4],zmm3[6],zmm31[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512-FCP-NEXT: 
vpermt2q %zmm27, %zmm6, %zmm17 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 -; AVX512-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 -; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 -; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] -; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 
%zmm18, %zmm19 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm19 +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm13, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm13[0],zmm18[0],zmm13[2],zmm18[2],zmm13[4],zmm18[4],zmm13[6],zmm18[6] +; AVX512-FCP-NEXT: vpermi2q %zmm18, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 @@ -18402,21 +18423,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 ; AVX512-FCP-NEXT: movb $6, %sil -; AVX512-FCP-NEXT: kmovw %esi, %k5 ; AVX512-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: kmovw %esi, %k5 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512-FCP-NEXT: movb $64, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 @@ -18425,16 +18445,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: kmovw %esi, %k4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} ; AVX512-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512-FCP-NEXT: 
vpermt2q %zmm0, %zmm1, %zmm15 ; AVX512-FCP-NEXT: movb $12, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k4 ; AVX512-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 @@ -18442,68 +18460,72 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,u,7] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,9,u,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k5} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} ; AVX512-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} -; AVX512-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} ; AVX512-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,9,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm18 ; AVX512-FCP-NEXT: movb $8, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} @@ -18516,13 
+18538,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} ; AVX512-FCP-NEXT: movb $-31, %sil ; AVX512-FCP-NEXT: kmovw %esi, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -18535,42 +18558,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} -; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = 
xmm3[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} -; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm10 {%k4} +; AVX512-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} +; AVX512-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k4} ; AVX512-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -18583,66 +18606,67 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k2} ; AVX512-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} ; AVX512-FCP-NEXT: movb $56, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 
{%k2} ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: movb $14, %cl ; AVX512-FCP-NEXT: kmovw %ecx, %k2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k2} ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} @@ -18716,33 +18740,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 3008(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 2944(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 2880(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2752(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 2624(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 2560(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 2368(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 2304(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 2304(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 1984(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 1920(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 1856(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 1856(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1728(%rax) @@ -18751,8 +18775,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm7, 1472(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 1408(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) @@ -18761,7 +18786,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; 
AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax) @@ -18780,14 +18805,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 192(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 3520(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 3520(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18795,29 +18820,28 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 3264(%rax) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 3072(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 +; AVX512-FCP-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i64_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $6280, %rsp # imm = 0x1888 +; AVX512DQ-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6 @@ -18836,11 +18860,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[9,1,9,1,9,1,9,1] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 @@ -18849,15 +18873,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-NEXT: movb $28, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k2 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] @@ -18889,14 +18913,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm7, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 @@ -18907,7 +18930,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18916,10 +18939,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm8 @@ -18933,19 +18956,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18955,10 +18978,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -18971,54 +18994,57 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: 
vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm28, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQ-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm14 +; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 256(%r9), %ymm11 +; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 256(%r8), %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 256(%r8), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 256(%r9), %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 @@ -19029,32 +19055,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm25, 
%zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm29, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm29 -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512DQ-NEXT: vmovdqa 320(%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm31, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512DQ-NEXT: vmovdqa 320(%r9), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm7[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%r8), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 320(%r9), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 @@ -19068,40 +19097,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 384(%rsi), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm1 ; AVX512DQ-NEXT: movb $24, %r10b ; AVX512DQ-NEXT: kmovw %r10d, %k3 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-NEXT: vbroadcasti32x4 
{{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm30 # 64-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm7, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm7, %zmm21 +; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm7, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm7, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm7, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 @@ -19118,13 +19146,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti64x4 
{{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 @@ -19136,39 +19164,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm2, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19179,13 +19207,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-NEXT: 
vmovdqa64 %zmm29, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 448(%rdx), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 -; AVX512DQ-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19193,7 +19221,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm28 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 @@ -19205,25 +19232,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %r10d, %k4 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19233,15 +19260,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 
%zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm25, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -19267,61 +19293,62 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm5, %zmm31 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm26 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = 
zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm5, %zmm24 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm18[0],zmm1[0],zmm18[2],zmm1[2],zmm18[4],zmm1[4],zmm18[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm28, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm28, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-NEXT: vpermi2q %zmm18, %zmm28, %zmm9 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k4} = zmm28[0],zmm18[0],zmm28[2],zmm18[2],zmm28[4],zmm18[4],zmm28[6],zmm18[6] +; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 @@ -19329,20 +19356,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} 
zmm8 = [12,u,u,3,4,5,6,13] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-NEXT: movb $6, %sil -; AVX512DQ-NEXT: kmovw %esi, %k4 ; AVX512DQ-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQ-NEXT: kmovw %esi, %k4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 ; AVX512DQ-NEXT: movb $64, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 @@ -19351,16 +19378,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: kmovw %esi, %k5 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} ; AVX512DQ-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-NEXT: movb $12, %sil ; AVX512DQ-NEXT: kmovw %esi, %k5 @@ -19369,15 +19396,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512DQ-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} ; AVX512DQ-NEXT: vmovdqa 384(%r9), %ymm12 ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -19385,7 +19412,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte 
Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 @@ -19394,25 +19421,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $8, %sil @@ -19421,13 +19448,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19438,13 +19464,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19457,22 +19484,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512DQ-NEXT: vmovdqa 256(%rdx), %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} ; AVX512DQ-NEXT: vmovdqa 320(%rdx), %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -19487,41 +19514,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-NEXT: 
vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm4 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} +; AVX512DQ-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm30 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} ; AVX512DQ-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k4} ; AVX512DQ-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512DQ-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} ; AVX512DQ-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -19532,16 +19559,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} @@ -19592,10 +19618,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -19618,19 +19644,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: movb $14, %cl ; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8 # 32-byte Reload ; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -19643,13 +19669,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -19657,27 +19676,34 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 3008(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 2880(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm7, 2816(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 2752(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 2688(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 2560(%rax) ; 
AVX512DQ-NEXT: vmovdqa64 %zmm22, 2496(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 2432(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm6, 2368(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 2304(%rax) @@ -19685,24 +19711,24 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 2112(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1792(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1664(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm4, 1472(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1408(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1216(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1152(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -19715,18 +19741,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 640(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19748,7 +19774,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 3136(%rax) 
-; AVX512DQ-NEXT: addq $6280, %rsp # imm = 0x1888 +; AVX512DQ-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -19760,52 +19786,52 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $96, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX512DQ-FCP-NEXT: movb $28, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] @@ -19813,61 +19839,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm19, %zmm4, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%r9), %ymm0 @@ -19877,204 +19907,205 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm6, %zmm10, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %ymm20 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm19[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rsi), %zmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%r9), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rax), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%r8), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm17[0],ymm13[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm19[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r8), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%r9), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rsi), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rax), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%r8), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 320(%r8), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r8), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%r9), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm26, %ymm0, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm17, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 384(%r9), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 
384(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 ; AVX512DQ-FCP-NEXT: movb $24, %r10b ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20087,51 +20118,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $48, %r10b -; AVX512DQ-FCP-NEXT: kmovw %r10d, %k4 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: kmovw %r10d, %k4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm11[0],zmm16[0],zmm11[2],zmm16[2],zmm11[4],zmm16[4],zmm11[6],zmm16[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20140,159 +20158,170 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: 
vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm31 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm20 +; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm26, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm26[0],zmm9[0],zmm26[2],zmm9[2],zmm26[4],zmm9[4],zmm26[6],zmm9[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 @@ -20301,20 +20330,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQ-FCP-NEXT: movb $6, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k5 ; AVX512DQ-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: movb $64, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 @@ -20323,15 +20353,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,11,u,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: movb $12, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k4 @@ 
-20340,32 +20370,31 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} -; AVX512DQ-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512DQ-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k5} ; AVX512DQ-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} @@ -20373,9 +20402,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20386,114 +20414,114 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $8, %sil ; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-FCP-NEXT: movb $112, %sil +; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQ-FCP-NEXT: movb $-31, %sil -; AVX512DQ-FCP-NEXT: kmovw %esi, %k2 +; AVX512DQ-FCP-NEXT: kmovw %esi, %k3 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k4} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm25 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} -; AVX512DQ-FCP-NEXT: movb $112, %cl -; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} ; AVX512DQ-FCP-NEXT: movb $56, %cl ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 @@ -20507,12 +20535,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -20526,18 +20554,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm9 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} @@ -20566,8 +20594,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -20582,7 +20610,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movb $-61, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -20624,11 +20652,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 2624(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 2432(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 2240(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 2304(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 2112(%rax) @@ -20642,7 +20670,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1536(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 1472(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20673,7 +20701,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -20687,7 +20715,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 3328(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 3328(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -20701,307 +20729,311 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-LABEL: store_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6248, %rsp # imm = 0x1868 +; AVX512BW-NEXT: subq $6312, %rsp # imm = 0x18A8 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [2,10,0,3,2,10,0,3] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $96, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm25 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512BW-NEXT: movb $28, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,0,5,4,12,0,5] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, 
%zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,0,14,6,5,0,14,6] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [6,13,14,7,6,13,14,7] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm27 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm5 -; AVX512BW-NEXT: vmovdqa 128(%r9), %ymm4 -; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-NEXT: vmovdqa 128(%r9), %ymm5 +; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 128(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa 192(%r9), %ymm5 -; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%r9), %ymm7 +; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512BW-NEXT: 
vmovdqa64 256(%rsi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm31, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa 256(%r9), %ymm7 -; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 256(%r9), %ymm14 +; AVX512BW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 256(%r8), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q 
%zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512BW-NEXT: vmovdqa 320(%r9), %ymm5 ; AVX512BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm9 ; AVX512BW-NEXT: movb $24, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,15,0] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -21022,7 +21054,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 @@ -21035,76 +21067,75 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm0 +; 
AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 ; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm24, %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 ; AVX512BW-NEXT: movb $48, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k4 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 @@ -21113,7 +21144,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 @@ -21122,7 +21153,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21140,106 +21172,104 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm1, 
%zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm27[0],zmm0[0],zmm27[2],zmm0[2],zmm27[4],zmm0[4],zmm27[6],zmm0[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k4} = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k4} = zmm31[0],zmm26[0],zmm31[2],zmm26[2],zmm31[4],zmm26[4],zmm31[6],zmm26[6] ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm22[0],zmm15[2],zmm22[2],zmm15[4],zmm22[4],zmm15[6],zmm22[6] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm19, 
%zmm27 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm19[0],zmm1[0],zmm19[2],zmm1[2],zmm19[4],zmm1[4],zmm19[6],zmm1[6] -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm20[0],zmm1[0],zmm20[2],zmm1[2],zmm20[4],zmm1[4],zmm20[6],zmm1[6] +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm28, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm28, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512BW-NEXT: 
vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm28, %zmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm19[0],zmm28[2],zmm19[2],zmm28[4],zmm19[4],zmm28[6],zmm19[6] +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm28, %zmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm28[0],zmm12[0],zmm28[2],zmm12[2],zmm28[4],zmm12[4],zmm28[6],zmm12[6] ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm28 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [12,0,0,3,4,5,6,13] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17 ; AVX512BW-NEXT: movb $6, %sil ; AVX512BW-NEXT: kmovd %esi, %k4 ; AVX512BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,9,0,0,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8 ; AVX512BW-NEXT: movb $64, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 @@ -21248,16 +21278,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %esi, %k5 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5} ; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm6 = [13,0,2,3,4,5,6,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: movb $12, %sil ; AVX512BW-NEXT: kmovd %esi, %k5 @@ -21266,15 +21296,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 
{%k5} ; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 384(%r9), %ymm12 ; AVX512BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -21282,160 +21312,159 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm12 = [0,1,2,3,10,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $8, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} ; AVX512BW-NEXT: movb $-31, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 
%zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k5} ; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} ; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k5} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} ; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k5} ; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k5} ; AVX512BW-NEXT: movb $112, %sil ; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 
%zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $3, 384(%rax), %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} ; AVX512BW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} -; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k4} ; AVX512BW-NEXT: 
vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512BW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} +; AVX512BW-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} ; AVX512BW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k4} ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21443,10 +21472,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} @@ -21454,50 +21489,45 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: movb $120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} ; AVX512BW-NEXT: movb $-61, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -21515,86 +21545,87 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm7 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: movb $14, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # 
ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 2752(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 2944(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 2880(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, 2816(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 2752(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 2624(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 2624(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 2560(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 2432(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 2240(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 2304(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 2240(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 2176(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 2112(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1536(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm4, 1472(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21603,9 +21634,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1088(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -21614,8 +21645,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 640(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm2, 576(%rax) @@ -21623,10 +21654,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) @@ -21649,334 +21680,328 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-NEXT: addq $6248, %rsp # imm = 0x1868 +; AVX512BW-NEXT: addq $6312, %rsp # imm = 0x18A8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i64_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $6120, %rsp # imm = 0x17E8 +; AVX512BW-FCP-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512BW-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [11,3,11,3,11,3,11,3] -; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] +; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: movb $96, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm2 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm0 +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm8 ; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512BW-FCP-NEXT: movb $28, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,12,0,5,4,12,0,5] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,12,7,0,1,12,7] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] +; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,6,7,0,13,6,7] -; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,13,6,7,0,13,6,7] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm1 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [6,13,14,7,6,13,14,7] +; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm29, 
%zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm30 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa 192(%r9), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm24 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] -; AVX512BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm14[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm28 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm25[0],ymm28[0],ymm25[2],ymm28[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm4[2,3,2,3],zmm13[2,3,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm7, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm29, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm9, %zmm3 -; AVX512BW-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm2, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa 256(%r9), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 256(%r8), %ymm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm6[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm7, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 320(%r9), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %ymm31 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm31[0],ymm6[0],ymm31[2],ymm6[2] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %ymm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm18[0],ymm6[0],ymm18[2],ymm6[2] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm8[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; 
AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm28, %ymm0, %ymm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %ymm10, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm31 -; AVX512BW-FCP-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %ymm6, %ymm0, %ymm18 +; AVX512BW-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 384(%r9), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %ymm2, %ymm0, %ymm4 ; AVX512BW-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm18 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm28 ; AVX512BW-FCP-NEXT: movb $24, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermt2q (%rsp), %zmm20, %zmm0 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm28 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] @@ -21985,216 +22010,219 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm7, %zmm11 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $48, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k4 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm12[0],zmm19[0],zmm12[2],zmm19[2],zmm12[4],zmm19[4],zmm12[6],zmm19[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: 
vpermt2q %zmm27, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k4} = zmm3[0],zmm28[0],zmm3[2],zmm28[2],zmm3[4],zmm28[4],zmm3[6],zmm28[6] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm4, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm6, %zmm24 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k4} = zmm3[0],zmm31[0],zmm3[2],zmm31[2],zmm3[4],zmm31[4],zmm3[6],zmm31[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm17 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k4} = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm27 -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm31, %zmm26 -; AVX512BW-FCP-NEXT: vpermi2q %zmm31, %zmm10, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = 
zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm10, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm31 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm25 -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm14[0],zmm18[0],zmm14[2],zmm18[2],zmm14[4],zmm18[4],zmm14[6],zmm18[6] -; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm6, %zmm19 +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm13, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k4} = zmm13[0],zmm18[0],zmm13[2],zmm18[2],zmm13[4],zmm18[4],zmm13[6],zmm18[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 @@ -22203,21 +22231,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 
%zmm8 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,11,0,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [12,0,0,3,4,5,6,13] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm17 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm17 ; AVX512BW-FCP-NEXT: movb $6, %sil -; AVX512BW-FCP-NEXT: kmovd %esi, %k5 ; AVX512BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-FCP-NEXT: kmovd %esi, %k5 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: movb $64, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 @@ -22226,16 +22253,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,1,11,0,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,12,0,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm11, %zmm8 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [13,0,2,3,4,5,6,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm15 ; AVX512BW-FCP-NEXT: movb $12, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdx), %xmm1 @@ -22243,68 +22268,72 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,2,3,9,0,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,u,7] 
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,9,u,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k5} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} ; AVX512BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} -; AVX512BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k5} ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = 
zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm8, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,9,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm18 ; AVX512BW-FCP-NEXT: movb $8, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm31 {%k2} @@ -22317,13 +22346,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k3} -; 
AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm13 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} ; AVX512BW-FCP-NEXT: movb $-31, %sil ; AVX512BW-FCP-NEXT: kmovd %esi, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -22336,42 +22366,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm8 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm9 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm10 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k4} ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -22384,66 +22414,67 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k2} ; AVX512BW-FCP-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} ; AVX512BW-FCP-NEXT: movb $56, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm25 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: movb $14, %cl ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm20 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm6 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm6 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm27 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm22 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k2} @@ -22517,33 +22548,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm12, %zmm12 # 64-byte Folded Reload ; AVX512BW-FCP-NEXT: # zmm12 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 3008(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 2944(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 2880(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2752(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2752(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 2688(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2624(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 2560(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 2432(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 2368(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 2304(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 2240(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 2304(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 1984(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 1920(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 1856(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 1856(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) @@ -22552,8 +22583,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1536(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm7, 1472(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 1408(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) @@ -22562,7 +22594,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1088(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 896(%rax) @@ -22581,14 +22613,14 @@ define void 
@store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 192(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 3520(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 3520(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -22596,29 +22628,28 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, 3328(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3264(%rax) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 3072(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 3072(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-FCP-NEXT: addq $6120, %rsp # imm = 0x17E8 +; AVX512BW-FCP-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: subq $6280, %rsp # imm = 0x1888 +; AVX512DQ-BW-NEXT: subq $6248, %rsp # imm = 0x1868 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 @@ -22637,11 +22668,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm7 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 @@ -22650,15 +22681,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-BW-NEXT: movb $28, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] @@ -22690,14 +22721,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 @@ -22708,7 +22738,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22717,10 +22747,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm8 @@ -22734,19 +22764,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm18 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22756,10 +22786,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -22772,54 +22802,57 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 
%zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm28, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 256(%r9), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 256(%r8), %ymm4 ; AVX512DQ-BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 256(%r8), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%r9), %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; 
AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm1 @@ -22830,32 +22863,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm25, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa 320(%r9), %ymm7 -; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa 320(%r9), %ymm1 +; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 320(%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm7[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm7[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 320(%r8), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%r9), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm17, %zmm0 @@ -22869,40 +22905,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rsi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: movb $24, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm30 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm30 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%r9), %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,1,2,3,4,5,15,0] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm17, %zmm2 @@ -22919,13 +22954,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] +; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,0,12,4,3,0,12,4] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm1 @@ -22937,39 +22972,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm30 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22980,13 +23015,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdx), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm7 ; AVX512DQ-BW-NEXT: vpermi2q %zmm27, %zmm5, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22994,7 +23029,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 @@ -23006,25 +23040,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %r10d, %k4 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k4} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, 
%zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23034,15 +23068,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -23068,61 +23101,62 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm30 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm31 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: 
vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm27 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm22 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm24 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k4} = zmm16[0],zmm3[0],zmm16[2],zmm3[2],zmm16[4],zmm3[4],zmm16[6],zmm3[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm24 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm18[0],zmm1[0],zmm18[2],zmm1[2],zmm18[4],zmm1[4],zmm18[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm25 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm28, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm28, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 ; 
AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm16, %zmm9 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512DQ-BW-NEXT: vpermi2q %zmm18, %zmm28, %zmm9 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k4} = zmm28[0],zmm18[0],zmm28[2],zmm18[2],zmm28[4],zmm18[4],zmm28[6],zmm18[6] +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm28 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm15, %zmm4 @@ -23130,20 +23164,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: movb $6, %sil -; AVX512DQ-BW-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-NEXT: vpbroadcastq 456(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4} -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,9,0,0,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10 ; AVX512DQ-BW-NEXT: movb $64, %sil ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 @@ -23152,16 +23186,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: kmovd %esi, %k5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5} ; AVX512DQ-BW-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,11,0,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm7 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 ; AVX512DQ-BW-NEXT: movb $12, %sil ; AVX512DQ-BW-NEXT: 
kmovd %esi, %k5 @@ -23170,15 +23204,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5} ; AVX512DQ-BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,9,0,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rax), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 384(%r9), %ymm12 ; AVX512DQ-BW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -23186,7 +23220,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[2] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm8[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 @@ -23195,25 +23229,25 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] +; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [0,1,2,3,4,5,8,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $8, %sil @@ -23222,13 +23256,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k3} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23239,13 +23272,14 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23258,22 +23292,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %xmm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm0, %zmm0, %zmm3 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdx), %xmm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdx), %xmm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -23288,41 +23322,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm27 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm30 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x2 $3, 384(%rax), %zmm0, %zmm28 {%k2} ; AVX512DQ-BW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm2 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: 
vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpbroadcastq 200(%rcx), %ymm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k4} ; AVX512DQ-BW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} -; AVX512DQ-BW-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpbroadcastq 328(%rcx), %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k4} ; AVX512DQ-BW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -23333,16 +23367,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm20 {%k2} @@ -23393,10 +23426,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512DQ-BW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 
64-byte Folded Reload +; AVX512DQ-BW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -23419,19 +23452,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: movb $14, %cl ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -23444,13 +23477,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -23458,27 +23484,34 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve 
; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm8 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-BW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 3008(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 2944(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2880(%rax) -; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 2816(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 2880(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm7, 2816(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 2752(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 2688(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 2560(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 2496(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 2432(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 2368(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 2304(%rax) @@ -23486,24 +23519,24 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 2176(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 2112(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm5, 1920(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 1792(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1728(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 1792(%rax) +; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovaps %zmm0, 
1728(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1664(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1600(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1536(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm4, 1472(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 1408(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1216(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 1152(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -23516,18 +23549,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 640(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 576(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -23549,7 +23582,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3072(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-BW-NEXT: addq $6280, %rsp # imm = 0x1888 +; AVX512DQ-BW-NEXT: addq $6248, %rsp # imm = 0x1868 ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -23561,52 +23594,52 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 64(%rdx), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX512DQ-BW-FCP-NEXT: 
movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] @@ -23614,61 +23647,65 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,12,0,5,4,12,0,5] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,1,12,7,0,1,12,7] ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] -; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,14,6,5,0,14,6] +; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,13,6,7,0,13,6,7] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,13,14,7,6,13,14,7] +; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm15, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%r9), %ymm0 @@ -23678,204 +23715,205 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 
%zmm16, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %ymm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm2[2,3,2,3],zmm19[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm7, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rsi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r9), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rax), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%r8), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm17[0],ymm13[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm5[2,3,2,3],zmm19[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r8), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%r9), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm23, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rsi), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm2, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rax), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r9), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r8), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%r8), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm7[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r8), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%r9), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm18 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm23, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm24, %ymm0, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm15, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm26, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm17, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r9), %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%r8), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm4, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdx), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm15 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rsi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm18, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rcx), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23888,51 +23926,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [13,5,13,5,13,5,13,5] ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $48, %r10b -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k4} = zmm11[0],zmm16[0],zmm11[2],zmm16[2],zmm11[4],zmm16[4],zmm11[6],zmm16[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23941,159 +23966,170 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k4} = zmm16[0],zmm0[0],zmm16[2],zmm0[2],zmm16[4],zmm0[4],zmm16[6],zmm0[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k4} = 
zmm12[0],zmm26[0],zmm12[2],zmm26[2],zmm12[4],zmm26[4],zmm12[6],zmm26[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm6, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm17[0],zmm16[0],zmm17[2],zmm16[2],zmm17[4],zmm16[4],zmm17[6],zmm16[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm23, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm28[0],zmm0[2],zmm28[2],zmm0[4],zmm28[4],zmm0[6],zmm28[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm17 
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k4} = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm4, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k4} = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm5, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rsi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k4} = zmm31[0],zmm21[0],zmm31[2],zmm21[2],zmm31[4],zmm21[4],zmm31[6],zmm21[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm6, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k4} = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm4, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm19, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm19, %zmm15, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm3, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm15, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm19 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm14[0],zmm9[0],zmm14[2],zmm9[2],zmm14[4],zmm9[4],zmm14[6],zmm9[6] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm26, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k4} = zmm26[0],zmm9[0],zmm26[2],zmm9[2],zmm26[4],zmm9[4],zmm26[6],zmm9[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm26, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k4} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm4 @@ -24102,20 +24138,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,11,0,0,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm13, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [12,0,0,3,4,5,6,13] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,3,4,15,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movb $6, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k5 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 456(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,2,9,0,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movb $64, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 @@ -24124,15 +24161,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,11,0,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,10,0,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,11,u,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,12,0,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm12, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [13,0,2,3,4,5,6,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,4,5,15,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movb $12, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k4 @@ -24141,32 +24178,31 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,2,3,4,8,0,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,1,2,3,9,0,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k5} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} @@ -24174,9 +24210,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24187,114 +24222,114 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rax), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm1, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: movb $8, %sil ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm21 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-BW-FCP-NEXT: movb $112, %sil +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQ-BW-FCP-NEXT: movb $-31, %sil -; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k2 +; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm9 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k4} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm25 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k4} -; AVX512DQ-BW-FCP-NEXT: movb $112, %cl -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 256(%rax), %zmm0, %zmm29 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 320(%rax), %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 384(%rax), %zmm18, %zmm23 {%k2} ; AVX512DQ-BW-FCP-NEXT: movb $56, %cl ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 @@ -24308,12 +24343,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -24327,18 +24362,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm9 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm28 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm26 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm22 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k2} @@ -24367,8 +24402,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # zmm9 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte 
Reload @@ -24383,7 +24418,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movb $-61, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] @@ -24425,11 +24460,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, 2624(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 2432(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2432(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 2368(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 2240(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 2304(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 2240(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 2176(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 2112(%rax) @@ -24443,7 +24478,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1536(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1536(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 1472(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -24474,7 +24509,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -24488,7 +24523,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 3328(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 3328(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index e837f14d367b2..aaa209d0c7f29 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -62,9 +62,9 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps (%r9), %xmm3 ; AVX-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX-NEXT: vinsertf128 $1, (%r11), %ymm3, %ymm3 ; AVX-NEXT: vinsertf128 $1, (%r10), %ymm2, %ymm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] @@ -170,13 +170,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -194,13 +194,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -218,13 +218,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) @@ -242,13 +242,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -266,13 +266,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -290,13 +290,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -314,13 +314,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -338,13 +338,13 @@ define void @store_i64_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) @@ -640,15 +640,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-NEXT: vmovdqa (%r11), %ymm3 +; AVX512-NEXT: vmovdqa (%r10), %ymm3 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -669,6 +668,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; 
AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -683,15 +683,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%r11), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -712,6 +711,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -726,15 +726,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r11), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -755,6 +754,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -769,15 +769,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 -; 
AVX512DQ-FCP-NEXT: vmovdqa (%r11), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -798,6 +797,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -812,15 +812,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512BW-NEXT: vmovdqa (%r11), %ymm3 +; AVX512BW-NEXT: vmovdqa (%r10), %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -841,6 +840,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -855,15 +855,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa (%r11), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %ymm3 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -884,6 +883,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -898,15 +898,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa (%r11), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %ymm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -927,6 +926,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -941,15 +941,14 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rax), %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -970,6 +969,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] @@ -1208,8 +1208,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vinsertf128 
$1, (%rdx), %ymm6, %ymm6 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm9 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX-NEXT: vmovaps 16(%r9), %xmm10 @@ -1289,18 +1289,18 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps 32(%r9), %xmm7 ; AVX2-NEXT: vmovaps (%r8), %xmm10 ; AVX2-NEXT: vmovaps 32(%r8), %xmm8 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] -; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 40(%r10), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX2-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm12[1] ; AVX2-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -1311,7 +1311,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] ; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-NEXT: vmovaps (%rcx), %xmm15 @@ -1324,33 +1324,33 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm12[0] ; AVX2-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-NEXT: vmovaps (%r8), %ymm13 +; AVX2-NEXT: vmovaps (%r8), %ymm12 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] -; AVX2-NEXT: vmovaps (%r9), %ymm14 ; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 +; AVX2-NEXT: vmovaps (%r9), %ymm14 ; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm11[0] ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-NEXT: vperm2f128 
{{.*#+}} ymm10 = ymm10[2,3],mem[2,3] +; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vbroadcastsd 16(%rax), %ymm10 -; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vbroadcastsd 16(%rcx), %ymm11 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX2-NEXT: vmovaps 32(%r9), %ymm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-NEXT: vbroadcastsd 56(%r10), %ymm15 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-NEXT: vbroadcastsd 48(%rax), %ymm1 @@ -1364,14 +1364,14 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] ; AVX2-NEXT: vbroadcastsd 48(%rcx), %ymm3 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] -; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-NEXT: vbroadcastsd 24(%r10), %ymm12 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-NEXT: vbroadcastsd 24(%rdx), %ymm8 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] @@ -1411,18 +1411,18 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovaps 32(%r9), %xmm7 ; AVX2-FP-NEXT: vmovaps (%r8), %xmm10 ; AVX2-FP-NEXT: vmovaps 32(%r8), %xmm8 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] -; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 40(%r10), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX2-FP-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] +; AVX2-FP-NEXT: vunpckhpd 
{{.*#+}} xmm2 = xmm14[1],xmm12[1] ; AVX2-FP-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -1433,7 +1433,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] ; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FP-NEXT: vmovaps (%rcx), %xmm15 @@ -1446,33 +1446,33 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm12[0] ; AVX2-FP-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] -; AVX2-FP-NEXT: vmovaps (%r9), %ymm14 ; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovaps (%r9), %ymm14 ; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm11[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],mem[2,3] +; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FP-NEXT: vbroadcastsd 16(%rax), %ymm10 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FP-NEXT: vbroadcastsd 16(%rcx), %ymm11 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FP-NEXT: vmovaps 32(%r9), %ymm0 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FP-NEXT: vbroadcastsd 56(%r10), %ymm15 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FP-NEXT: vbroadcastsd 48(%rax), %ymm1 @@ -1486,14 +1486,14 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} 
ymm1 = ymm1[2,3],mem[2,3] ; AVX2-FP-NEXT: vbroadcastsd 48(%rcx), %ymm3 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] -; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-FP-NEXT: vbroadcastsd 24(%r10), %ymm12 +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-FP-NEXT: vbroadcastsd 24(%rdx), %ymm8 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] @@ -1533,18 +1533,18 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovaps 32(%r9), %xmm7 ; AVX2-FCP-NEXT: vmovaps (%r8), %xmm10 ; AVX2-FCP-NEXT: vmovaps 32(%r8), %xmm8 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] -; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 40(%r10), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm12[1] ; AVX2-FCP-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -1555,7 +1555,7 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] ; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm15 @@ -1568,33 +1568,33 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm12[0] ; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm1 ; AVX2-FCP-NEXT: vinsertf128 $1, 
32(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13 +; AVX2-FCP-NEXT: vmovaps (%r8), %ymm12 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] -; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14 ; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm11[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],mem[2,3] +; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm10 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vbroadcastsd 16(%rcx), %ymm11 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm0 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-FCP-NEXT: vbroadcastsd 56(%r10), %ymm15 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%rax), %ymm1 @@ -1608,14 +1608,14 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] ; AVX2-FCP-NEXT: vbroadcastsd 48(%rcx), %ymm3 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] -; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-FCP-NEXT: vbroadcastsd 24(%r10), %ymm12 +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-FCP-NEXT: vbroadcastsd 24(%rdx), %ymm8 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] @@ -1679,17 +1679,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -1731,11 +1731,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -1788,17 +1788,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -1840,11 +1840,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -1897,17 +1897,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; 
AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -1949,11 +1949,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2006,17 +2006,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -2058,11 +2058,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2115,17 +2115,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -2167,11 +2167,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2224,17 +2224,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = 
zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -2276,11 +2276,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2333,17 +2333,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -2385,11 +2385,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2442,17 +2442,17 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm13, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: 
vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm13, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14 @@ -2494,11 +2494,11 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] @@ -2898,17 +2898,17 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm5 -; AVX-NEXT: vbroadcastsd 8(%r10), %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX-NEXT: vbroadcastsd 8(%r10), %ymm5 +; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 -; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm6, %ymm6 -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX-NEXT: vinsertf128 $1, 32(%rdx), %ymm6, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[2] ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%r9), %xmm6 ; AVX-NEXT: vmovaps 32(%r8), %xmm7 @@ -2918,8 +2918,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[2] ; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm5 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill @@ -2978,10 +2978,10 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 104(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 @@ -3044,18 +3044,18 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 112(%rdi), %xmm5 ; AVX-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX-NEXT: vmovaps 112(%r9), %xmm15 +; AVX-NEXT: vbroadcastsd 112(%rcx), %ymm13 +; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX-NEXT: vmovaps 112(%r9), %xmm14 ; AVX-NEXT: vmovaps 112(%r8), %xmm0 -; AVX-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm15[0] -; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm14[0] +; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vbroadcastsd 112(%rax), %ymm12 -; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] ; AVX-NEXT: vbroadcastsd 120(%rdx), %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX-NEXT: vbroadcastsd 120(%r10), %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx @@ -3070,7 +3070,7 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps %ymm0, 992(%rdx) ; AVX-NEXT: vmovaps %ymm1, 960(%rdx) ; AVX-NEXT: vmovaps %ymm12, 928(%rdx) -; AVX-NEXT: vmovaps %ymm14, 896(%rdx) +; AVX-NEXT: vmovaps %ymm13, 896(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 864(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3132,17 +3132,17 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 
(%rdi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rax), %xmm0 @@ -3184,20 +3184,20 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%r9), %xmm8 ; AVX2-NEXT: vmovaps 64(%r8), %xmm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 64(%rax), %xmm6 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX2-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm5[1] ; AVX2-NEXT: vbroadcastsd 104(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%r9), %xmm2 @@ -3226,15 +3226,15 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 ; AVX2-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-NEXT: vmovups 
%ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 @@ -3246,13 +3246,13 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4 -; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vbroadcastsd %xmm4, %ymm2 ; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3405,17 +3405,17 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 @@ -3457,20 +3457,20 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm8 ; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm7 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm6 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] +; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm5[1] ; AVX2-FP-NEXT: vbroadcastsd 104(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%r9), %xmm2 @@ -3499,15 +3499,15 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 ; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 @@ -3519,13 +3519,13 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4 -; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FP-NEXT: vbroadcastsd %xmm4, %ymm2 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3678,17 +3678,17 
@@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 @@ -3730,20 +3730,20 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm8 ; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm7 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm6 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm5[1] ; AVX2-FCP-NEXT: vbroadcastsd 104(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm2 @@ -3772,15 +3772,15 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm13, %ymm13 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 ; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 @@ -3792,13 +3792,13 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm4, %ymm4 -; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FCP-NEXT: vbroadcastsd %xmm4, %ymm2 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3950,184 +3950,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-NEXT: 
vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512-NEXT: movb $-64, %r8b ; AVX512-NEXT: kmovw %r8d, %k1 -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; 
AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512-NEXT: 
vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] 
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -4149,184 +4149,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512-FCP-NEXT: movb $-64, %r8b ; AVX512-FCP-NEXT: kmovw %r8d, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512-FCP-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpunpckhqdq 
{{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512-FCP-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} 
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; 
AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -4348,184 +4348,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512DQ-NEXT: movb $-64, %r8b ; AVX512DQ-NEXT: kmovw %r8d, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), 
%xmm16 +; AVX512DQ-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm16 = 
zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 
%zmm19 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512DQ-NEXT: 
vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -4547,184 +4547,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512DQ-FCP-NEXT: movb $-64, %r8b ; AVX512DQ-FCP-NEXT: kmovw %r8d, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; 
AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%zmm8, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-FCP-NEXT: vpunpckhqdq 
{{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -4746,184 +4746,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, 
%zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-NEXT: 
vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = 
ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -4945,184 +4945,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, 
%zmm14 ; AVX512BW-FCP-NEXT: movb $-64, %r8b ; AVX512BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 
%zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512BW-FCP-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: # zmm12 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -5144,184 +5144,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 
64(%rcx), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512DQ-BW-NEXT: movb $-64, %r8b ; AVX512DQ-BW-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq 
{{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 
%zmm6, 960(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -5343,184 +5343,184 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r11), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r8b ; AVX512DQ-BW-FCP-NEXT: kmovd %r8d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%rdx), %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm16[1],ymm10[1],ymm16[3],ymm10[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%rcx), %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm19 = ymm10[1],ymm16[1],ymm10[3],ymm16[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm19, %zmm14, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm16[0],ymm10[0],ymm16[2],ymm10[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} 
ymm10 = ymm10[0],ymm16[0],ymm10[2],ymm16[2] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm22, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm6[1],zmm7[1],zmm6[3],zmm7[3],zmm6[5],zmm7[5],zmm6[7],zmm7[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm20, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm20, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm5[1],zmm7[1],zmm5[3],zmm7[3],zmm5[5],zmm7[5],zmm5[7],zmm7[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm16 = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm13, 
%zmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm16, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm6, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm20, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm2, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm3[1],zmm4[1],zmm3[3],zmm4[3],zmm3[5],zmm4[5],zmm3[7],zmm4[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm3[0],zmm4[0],zmm3[2],zmm4[2],zmm3[4],zmm4[4],zmm3[6],zmm4[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm2[0],zmm4[0],zmm2[2],zmm4[2],zmm2[4],zmm4[4],zmm2[6],zmm4[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] +; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %ymm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm14[1],ymm24[3],ymm14[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm24[1],ymm13[1],ymm24[3],ymm13[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm28[1],ymm26[1],ymm28[3],ymm26[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm14[0],ymm24[2],ymm14[2] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [2,10,2,10,2,10,2,10] +; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm24[0],ymm13[0],ymm24[2],ymm13[2] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm26[0],ymm28[2],ymm26[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: 
vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm25[1],ymm23[1],ymm25[3],ymm23[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm30[1],ymm27[1],ymm30[3],ymm27[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm23[0],ymm25[2],ymm23[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm30[0],ymm27[0],ymm30[2],ymm27[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 960(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax) @@ -6260,10 +6260,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm3 -; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm5 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX-NEXT: vbroadcastsd 8(%rdx), %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm5 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm4[1] ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm3 @@ -6288,8 +6288,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2] ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX-NEXT: vbroadcastsd 40(%rdx), %ymm4 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX-NEXT: vblendps 
{{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6348,10 +6348,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 104(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1 @@ -6376,10 +6376,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 136(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 136(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 @@ -6404,10 +6404,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 168(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 168(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1 @@ -6432,10 +6432,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 200(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 200(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1 @@ -6460,10 +6460,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 232(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 232(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 @@ -6784,17 +6784,17 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rax), %xmm0 @@ -6839,13 +6839,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: 
vmovaps 64(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -6887,15 +6887,15 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 128(%rax), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6907,7 +6907,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vbroadcastsd 168(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6917,26 +6917,27 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 160(%rax), %xmm12 -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 160(%rax), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-NEXT: vmovaps 192(%rsi), %xmm12 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-NEXT: vbroadcastsd 200(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 192(%rcx), %xmm9 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 192(%rcx), %xmm10 +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %xmm8 -; AVX2-NEXT: vmovaps 192(%r8), %xmm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-NEXT: vbroadcastsd 200(%r10), 
%ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 192(%rax), %xmm6 -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 192(%r9), %xmm9 +; AVX2-NEXT: vmovaps 192(%r8), %xmm8 +; AVX2-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 192(%rax), %xmm7 +; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 224(%rsi), %xmm5 @@ -6944,120 +6945,120 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-NEXT: vbroadcastsd 232(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 224(%rcx), %xmm6 +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 224(%r9), %xmm2 +; AVX2-NEXT: vmovaps 224(%r9), %xmm3 ; AVX2-NEXT: vmovaps 224(%r8), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm3[1] ; AVX2-NEXT: vbroadcastsd 232(%r10), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps 224(%rax), %xmm0 -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 
$1, 128(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX2-NEXT: vbroadcastsd (%rsp), %ymm14 # 16-byte Folded Reload ; AVX2-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13 -; AVX2-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7 -; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4 -; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm3 ; AVX2-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd %xmm2, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %ymm0 @@ -7140,11 +7141,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vbroadcastsd 120(%rdx), %ymm1 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -7239,9 +7240,9 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -7381,17 +7382,17 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 @@ -7436,13 +7437,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7484,15 +7485,15 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7504,7 +7505,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vbroadcastsd 168(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7514,26 +7515,27 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-FP-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm12 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 160(%rax), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-FP-NEXT: vmovaps 192(%rsi), %xmm12 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-FP-NEXT: vbroadcastsd 200(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm9 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 192(%rcx), %xmm10 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm8 -; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm7 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm6 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm9 +; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm8 +; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm7 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 224(%rsi), %xmm5 @@ -7541,120 +7543,120 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-FP-NEXT: vbroadcastsd 232(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 224(%rcx), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps 224(%r9), %xmm3 ; AVX2-FP-NEXT: vmovaps 224(%r8), %xmm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm3[1] ; AVX2-FP-NEXT: vbroadcastsd 232(%r10), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm0 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups 
%ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd (%rsp), %ymm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13 -; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7 -; AVX2-FP-NEXT: 
vbroadcastsd %xmm6, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4 -; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm3 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1 -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd %xmm2, %ymm0 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 @@ -7737,11 +7739,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vbroadcastsd 120(%rdx), %ymm1 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FP-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -7836,9 +7838,9 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -7978,17 +7980,17 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 8(%rdx), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 @@ -8033,13 +8035,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 64(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -8081,15 +8083,15 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8101,7 +8103,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vbroadcastsd 168(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8111,26 +8113,27 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-FCP-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm12 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 160(%rax), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-FCP-NEXT: vmovaps 192(%rsi), %xmm12 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-FCP-NEXT: vbroadcastsd 200(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm9 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 192(%rcx), %xmm10 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm8 -; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm7 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm6 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm9 +; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm8 +; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm7 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %xmm5 @@ -8138,120 +8141,120 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-FCP-NEXT: vbroadcastsd 232(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm3 ; AVX2-FCP-NEXT: vmovaps 224(%r8), %xmm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm3[1] ; AVX2-FCP-NEXT: vbroadcastsd 232(%r10), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm0 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 
16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%r10), %ymm13, %ymm13 -; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), %ymm7, %ymm7 -; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm4, %ymm4 -; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), 
%ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm3 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r10), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd %xmm2, %ymm0 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 @@ -8334,11 +8337,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vbroadcastsd 120(%rdx), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-FCP-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -8433,9 +8436,9 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -8572,233 +8575,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm1 +; 
AVX512-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512-NEXT: movb $-64, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm22, 
%zmm11 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-NEXT: vpermt2q %zmm19, 
%zmm21, %zmm3 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512-NEXT: 
vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm16, 
%zmm11 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -8815,18 +8818,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill @@ -8840,7 +8843,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8855,49 +8858,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -8908,69 +8911,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -8979,13 +8982,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = 
ymm10[2,3],ymm5[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -8993,11 +8996,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -9010,8 +9013,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9021,7 +9024,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9045,233 +9048,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512-FCP-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovaps 128(%rdx), 
%zmm2 +; AVX512-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512-FCP-NEXT: movb $-64, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; 
AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; 
AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512-FCP-NEXT: vpermt2q 
%zmm19, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; 
AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -9288,18 +9291,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 ; 
AVX512-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9313,7 +9316,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9328,49 +9331,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 ; 
AVX512-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -9381,69 +9384,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpblendd 
$240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = 
ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -9452,13 +9455,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -9466,11 +9469,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -9483,8 +9486,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9494,7 +9497,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9518,233 +9521,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512DQ-NEXT: movb $-64, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] 
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-NEXT: 
vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermt2q 
%zmm2, %zmm24, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -9761,18 +9764,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512DQ-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9786,7 +9789,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9801,49 +9804,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512DQ-NEXT: vmovdqa 192(%rdi), 
%xmm4 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -9854,69 +9857,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512DQ-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 
64(%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512DQ-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512DQ-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -9925,13 +9928,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = 
ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -9939,11 +9942,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -9956,8 +9959,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9967,7 +9970,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9991,233 +9994,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-FCP-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), 
%zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512DQ-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -10234,18 +10237,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10259,7 +10262,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10274,49 +10277,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -10327,69 +10330,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, 
%zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; 
AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -10398,13 +10401,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -10412,11 +10415,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -10429,8 +10432,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10440,7 +10443,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10464,233 +10467,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-NEXT: 
vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = 
zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; 
AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = 
zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, 
%zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -10707,18 +10710,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = 
zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10732,7 +10735,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10747,49 +10750,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 
= ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -10800,69 +10803,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = 
ymm14[2,3],ymm4[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -10871,13 +10874,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -10885,11 +10888,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -10902,8 +10905,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10913,7 +10916,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10937,233 +10940,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: 
vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-FCP-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512BW-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512BW-FCP-NEXT: movb $-64, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 
-; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; 
AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = 
[2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, 
%zmm30, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -11180,18 +11183,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11205,7 +11208,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11220,49 +11223,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; 
AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -11273,69 +11276,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] 
+; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -11344,13 +11347,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa 
192(%rdx), %ymm15 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -11358,11 +11361,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -11375,8 +11378,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11386,7 +11389,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11410,233 +11413,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 
(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512DQ-BW-NEXT: movb $-64, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm14 
{%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; 
AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # 
zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -11653,18 +11656,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11678,7 +11681,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11693,49 +11696,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; 
AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -11746,69 +11749,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; 
AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -11817,13 +11820,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; 
AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -11831,11 +11834,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -11848,8 +11851,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11859,7 +11862,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11883,233 +11886,233 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: 
vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm20 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm27, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm13 = zmm18[0],zmm12[0],zmm18[2],zmm12[2],zmm18[4],zmm12[4],zmm18[6],zmm12[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [5,13,5,13,5,13,5,13] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm24, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm12[1],zmm18[3],zmm12[3],zmm18[5],zmm12[5],zmm18[7],zmm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm24, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm17[0],zmm13[0],zmm17[2],zmm13[2],zmm17[4],zmm13[4],zmm17[6],zmm13[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [5,13,5,13,5,13,5,13] ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm22, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm22, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm15[0],zmm25[2],zmm15[2],zmm25[4],zmm15[4],zmm25[6],zmm15[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm22, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm17[1],zmm13[1],zmm17[3],zmm13[3],zmm17[5],zmm13[5],zmm17[7],zmm13[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm25[1],zmm15[1],zmm25[3],zmm15[3],zmm25[5],zmm15[5],zmm25[7],zmm15[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm27, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm19[0],zmm16[2],zmm19[2],zmm16[4],zmm19[4],zmm16[6],zmm19[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm24, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm19[1],zmm16[3],zmm19[3],zmm16[5],zmm19[5],zmm16[7],zmm19[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, 
%ymm2, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm22, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm21, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-FCP-NEXT: 
vpermt2q %zmm20, %zmm24, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm15[0],zmm19[0],zmm15[2],zmm19[2],zmm15[4],zmm19[4],zmm15[6],zmm19[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm22, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm15[1],zmm19[1],zmm15[3],zmm19[3],zmm15[5],zmm19[5],zmm15[7],zmm19[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm21, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm20[1],zmm16[3],zmm20[3],zmm16[5],zmm20[5],zmm16[7],zmm20[7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm27, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm22, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm18, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm31, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm31, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm26, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm31, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm23, %zmm28 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm26, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm19[1],zmm0[1],zmm19[3],zmm0[3],zmm19[5],zmm0[5],zmm19[7],zmm0[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 @@ -12126,18 +12129,18 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r10), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm8, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r8), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm21 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm8[0],zmm1[0],zmm8[2],zmm1[2],zmm8[4],zmm1[4],zmm8[6],zmm1[6] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm8[1],zmm1[1],zmm8[3],zmm1[3],zmm8[5],zmm1[5],zmm8[7],zmm1[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12151,7 +12154,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12166,49 +12169,49 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdx), %ymm0, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rcx), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdx), %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm14, %zmm29 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rsi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rcx), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdx), %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm28 +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rsi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rcx), %ymm4, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdx), %ymm4, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] @@ -12219,69 +12222,69 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm9 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-BW-FCP-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 (%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rcx), %ymm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rcx), %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdx), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm21[0],ymm22[2],ymm21[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), 
%ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm21[0],ymm18[0],ymm21[2],ymm18[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm22[1],ymm21[1],ymm22[3],ymm21[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm21[1],ymm18[1],ymm21[3],ymm18[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload @@ -12290,13 +12293,13 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdx), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[2,3],ymm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm17[1],ymm19[3],ymm17[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -12304,11 +12307,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 1664(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 1216(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 640(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 1984(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 1984(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 1920(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 1856(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 1792(%rax) @@ -12321,8 +12324,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 1088(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 1024(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12332,7 +12335,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 512(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 512(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13784,10 +13787,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 8(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 8(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5 @@ -13804,9 +13807,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 64(%r8), %xmm2 ; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] ; AVX-NEXT: vmovaps 32(%rax), %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10 -; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3 -; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2] +; AVX-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm10 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[2] ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rax), %xmm3 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] @@ -13816,17 +13819,17 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5 -; AVX-NEXT: vbroadcastsd 40(%r10), %ymm6 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX-NEXT: vbroadcastsd 40(%r10), %ymm5 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm6 +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7 -; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6 -; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm7 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX-NEXT: 
vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[2] ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 @@ -13840,10 +13843,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 72(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 72(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1 @@ -13868,10 +13871,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 104(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 104(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1 @@ -13896,10 +13899,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 136(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 136(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX-NEXT: vmovaps 160(%rsi), %xmm1 @@ -13924,10 +13927,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 168(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 
168(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1 @@ -13952,10 +13955,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 200(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 200(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX-NEXT: vmovaps 224(%rsi), %xmm1 @@ -13980,10 +13983,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 232(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 232(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 256(%rsi), %xmm0 ; AVX-NEXT: vmovaps 256(%rdi), %xmm1 @@ -14008,10 +14011,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 264(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 264(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 288(%rsi), %xmm0 ; AVX-NEXT: vmovaps 288(%rdi), %xmm1 @@ -14036,10 +14039,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 296(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 296(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 320(%rsi), %xmm0 ; AVX-NEXT: vmovaps 320(%rdi), %xmm1 @@ -14064,10 +14067,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 328(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 328(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 352(%rcx), %xmm0 ; AVX-NEXT: vmovaps 352(%rsi), %xmm1 @@ -14092,10 +14095,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 360(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 360(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 384(%rcx), %xmm0 ; AVX-NEXT: vmovaps 384(%rsi), %xmm1 @@ -14120,10 +14123,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 392(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 392(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 416(%rcx), %xmm0 ; AVX-NEXT: vmovaps 416(%rsi), %xmm1 @@ -14148,10 +14151,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; 
AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 424(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 424(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 448(%rcx), %xmm0 ; AVX-NEXT: vmovaps 448(%rsi), %xmm1 @@ -14176,10 +14179,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 456(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 456(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 480(%rcx), %xmm0 ; AVX-NEXT: vmovaps 480(%rsi), %xmm1 @@ -14204,10 +14207,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX-NEXT: vbroadcastsd 488(%r10), %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX-NEXT: vbroadcastsd 488(%r10), %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX-NEXT: vmovaps 16(%rdi), %xmm1 @@ -14890,13 +14893,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovaps (%rax), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps (%r9), %xmm2 +; AVX2-NEXT: vmovaps (%r9), %xmm5 +; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%r8), %xmm2 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps (%r8), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 8(%r10), %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rcx), 
%xmm0 @@ -14931,13 +14934,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 64(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -14979,13 +14982,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 128(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -15027,13 +15030,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 192(%r9), %xmm1 +; AVX2-NEXT: vmovaps 192(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 192(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 200(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 192(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -15075,22 +15078,22 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 256(%r9), %xmm1 +; AVX2-NEXT: vmovaps 256(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 256(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 256(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 264(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 264(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 256(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -15123,13 +15126,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 320(%r9), %xmm1 +; AVX2-NEXT: vmovaps 320(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 320(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 320(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 328(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 328(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 320(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -15171,13 +15174,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 384(%r9), %xmm1 +; AVX2-NEXT: vmovaps 384(%r9), %xmm2 +; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 384(%r8), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 384(%r8), %xmm0 -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd 392(%r10), %ymm1 -; AVX2-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vbroadcastsd 392(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps 384(%rax), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -15195,32 +15198,33 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%r9), %xmm0 +; AVX2-NEXT: vmovaps 416(%r9), %xmm1 +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 416(%r8), %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 416(%r8), %xmm13 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 416(%rax), %xmm12 -; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 416(%rax), %xmm13 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rsi), %xmm11 -; AVX2-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-NEXT: vmovaps 448(%rsi), %xmm12 +; AVX2-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-NEXT: vbroadcastsd 456(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 448(%rcx), %xmm9 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 448(%rcx), %xmm10 +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%r9), %xmm8 -; AVX2-NEXT: vmovaps 448(%r8), %xmm7 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-NEXT: vbroadcastsd 456(%r10), %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 448(%rax), %xmm6 -; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 448(%r9), %xmm9 +; AVX2-NEXT: vmovaps 448(%r8), %xmm8 +; AVX2-NEXT: vbroadcastsd 456(%r10), %ymm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovaps 448(%rax), %xmm7 +; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rsi), %xmm5 @@ -15228,233 +15232,234 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-NEXT: vbroadcastsd 488(%rdx), %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovaps 480(%rcx), %xmm3 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-NEXT: vmovaps 480(%rcx), %xmm6 +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 480(%r9), %xmm2 -; AVX2-NEXT: vmovaps 480(%r8), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-NEXT: vmovaps 480(%r9), %xmm3 +; AVX2-NEXT: vmovaps 480(%r8), %xmm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm3[1] ; AVX2-NEXT: vbroadcastsd 488(%r10), %ymm15 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-NEXT: vmovaps 480(%rax), %xmm0 -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vmovaps 480(%rax), %xmm1 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; 
AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: 
vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14 -; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 128(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 192(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 256(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 288(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 320(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; 
AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 320(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 352(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 384(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 384(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, 416(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 -; AVX2-NEXT: 
vbroadcastsd %xmm12, %ymm12 -; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 -; AVX2-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 -; AVX2-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 -; AVX2-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vinsertf128 $1, 416(%r10), %ymm14, %ymm14 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd %xmm13, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-NEXT: vinsertf128 $1, 448(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-NEXT: vinsertf128 $1, 448(%r10), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-NEXT: vinsertf128 $1, 480(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm3 +; AVX2-NEXT: vinsertf128 $1, 480(%r10), %ymm2, %ymm2 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps (%rsi), %ymm1 @@ -16139,13 +16144,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovaps (%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps (%r9), %xmm5 +; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps (%r8), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FP-NEXT: 
vbroadcastsd 8(%r10), %ymm2 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 8(%r10), %ymm1 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rcx), %xmm0 @@ -16180,13 +16185,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 64(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -16228,13 +16233,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 128(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -16276,13 +16281,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 192(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 192(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd 
{{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 192(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -16324,22 +16329,22 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 256(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 256(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 256(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 264(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 264(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 256(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FP-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -16372,13 +16377,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 320(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 320(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 320(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 328(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 328(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 320(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -16420,13 
+16425,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 384(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps 384(%r9), %xmm2 +; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 384(%r8), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 384(%r8), %xmm0 -; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vbroadcastsd 392(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vbroadcastsd 392(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps 384(%rax), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -16444,32 +16449,33 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%r9), %xmm0 +; AVX2-FP-NEXT: vmovaps 416(%r9), %xmm1 +; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 416(%r8), %xmm0 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 416(%r8), %xmm13 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FP-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 416(%rax), %xmm12 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 416(%rax), %xmm13 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rsi), %xmm11 -; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-FP-NEXT: vmovaps 448(%rsi), %xmm12 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-FP-NEXT: vbroadcastsd 456(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 448(%rcx), %xmm9 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 448(%rcx), %xmm10 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%r9), %xmm8 -; AVX2-FP-NEXT: vmovaps 448(%r8), %xmm7 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FP-NEXT: vbroadcastsd 456(%r10), %ymm1 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 448(%rax), %xmm6 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 448(%r9), %xmm9 +; AVX2-FP-NEXT: vmovaps 448(%r8), %xmm8 +; 
AVX2-FP-NEXT: vbroadcastsd 456(%r10), %ymm0 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 448(%rax), %xmm7 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rsi), %xmm5 @@ -16477,233 +16483,234 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-FP-NEXT: vbroadcastsd 488(%rdx), %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 480(%rcx), %xmm3 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovaps 480(%rcx), %xmm6 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 480(%r9), %xmm2 -; AVX2-FP-NEXT: vmovaps 480(%r8), %xmm1 -; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vmovaps 480(%r9), %xmm3 +; AVX2-FP-NEXT: vmovaps 480(%r8), %xmm2 +; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm3[1] ; AVX2-FP-NEXT: vbroadcastsd 488(%r10), %ymm15 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FP-NEXT: vmovaps 480(%rax), %xmm0 -; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vmovaps 480(%rax), %xmm1 +; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 128(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 192(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 256(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 288(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FP-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14 -; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 320(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: 
vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 384(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 384(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdx), %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-FP-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 -; AVX2-FP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 -; AVX2-FP-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-FP-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 -; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 -; AVX2-FP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FP-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 -; AVX2-FP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vinsertf128 $1, 416(%r10), %ymm14, %ymm14 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd %xmm13, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-FP-NEXT: vinsertf128 $1, 448(%r10), %ymm0, %ymm0 +; AVX2-FP-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdx), %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-FP-NEXT: vbroadcastsd %xmm6, %ymm3 +; AVX2-FP-NEXT: vinsertf128 $1, 480(%r10), %ymm2, %ymm2 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vbroadcastsd %xmm1, %ymm0 +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps (%rsi), %ymm1 @@ -17388,13 +17395,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovaps (%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps (%r9), %xmm5 +; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps (%r8), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm2 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 8(%r10), %ymm1 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm0 @@ -17429,13 +17436,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 64(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 72(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: 
vmovaps 64(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -17477,13 +17484,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 128(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 128(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 136(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 128(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -17525,13 +17532,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 192(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 192(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 200(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 192(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -17573,22 +17580,22 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 256(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 256(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 256(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 264(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 264(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 256(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FCP-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -17621,13 +17628,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 320(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 320(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 320(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 328(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 328(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 320(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -17669,13 +17676,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 384(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps 384(%r9), %xmm2 +; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 384(%r8), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 384(%r8), %xmm0 -; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vbroadcastsd 392(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vbroadcastsd 392(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovaps 384(%rax), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -17693,32 +17700,33 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%r9), %xmm0 +; AVX2-FCP-NEXT: vmovaps 416(%r9), %xmm1 +; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 416(%r8), %xmm0 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 416(%r8), %xmm13 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FCP-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 416(%rax), %xmm12 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 416(%rax), %xmm13 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rsi), %xmm11 -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-FCP-NEXT: vmovaps 448(%rsi), %xmm12 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] ; AVX2-FCP-NEXT: vbroadcastsd 456(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 448(%rcx), %xmm9 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 448(%rcx), %xmm10 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%r9), %xmm8 -; AVX2-FCP-NEXT: vmovaps 448(%r8), %xmm7 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] -; AVX2-FCP-NEXT: vbroadcastsd 456(%r10), %ymm1 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 448(%rax), %xmm6 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 448(%r9), %xmm9 +; AVX2-FCP-NEXT: vmovaps 448(%r8), %xmm8 +; AVX2-FCP-NEXT: vbroadcastsd 456(%r10), %ymm0 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 448(%rax), %xmm7 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rsi), %xmm5 @@ -17726,233 +17734,234 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-FCP-NEXT: vbroadcastsd 488(%rdx), %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rcx), %xmm3 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovaps 480(%rcx), %xmm6 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 480(%r9), %xmm2 -; AVX2-FCP-NEXT: vmovaps 480(%r8), %xmm1 -; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vmovaps 480(%r9), %xmm3 +; AVX2-FCP-NEXT: vmovaps 480(%r8), %xmm2 +; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm2[1],xmm3[1] ; AVX2-FCP-NEXT: vbroadcastsd 488(%r10), %ymm15 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FCP-NEXT: vmovaps 480(%rax), %xmm0 -; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vmovaps 480(%rax), %xmm1 +; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, (%r10), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm14 = xmm0[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 32(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vinsertf128 $1, 32(%r10), %ymm15, %ymm15 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 64(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 128(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 128(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 160(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; 
AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 160(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 192(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: 
vinsertf128 $1, 224(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 256(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 256(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd (%rsp), %xmm14, 
%xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 288(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 320(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 320(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdx), %ymm0, %ymm0 ; 
AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-FCP-NEXT: vinsertf128 $1, 352(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdx), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 384(%r10), %ymm14, %ymm14 -; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 384(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 384(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 
# 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdx), %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 -; AVX2-FCP-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 -; AVX2-FCP-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 -; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 -; AVX2-FCP-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FCP-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vinsertf128 $1, 416(%r10), %ymm14, %ymm14 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd %xmm13, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm12[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm9[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 448(%r10), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm5[0] +; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdx), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-FCP-NEXT: vbroadcastsd %xmm6, %ymm3 +; AVX2-FCP-NEXT: vinsertf128 $1, 480(%r10), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; 
AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm0 +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll index 53a6d306ef84d..9ce0f1c2897bb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -606,7 +606,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -617,7 +617,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -628,7 +628,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -639,7 +639,7 @@ define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 
8802e8a779332..e9a8b583e60f2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -370,9 +370,9 @@ define void @store_i8_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) @@ -673,8 +673,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -696,8 +695,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -719,8 +717,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -742,8 +739,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512-NEXT: 
vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -765,8 +761,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -788,8 +783,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -811,8 +805,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -834,8 +827,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; 
AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -857,8 +849,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -880,8 +871,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -903,8 +893,7 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -1097,8 +1086,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] 
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1124,8 +1112,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1151,8 +1138,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1178,8 +1164,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1205,8 +1190,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1232,8 +1216,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1259,8 +1242,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1288,8 +1270,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 @@ -1314,8 +1295,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512BW-FCP-NEXT: # zmm2 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm2, %zmm1, %zmm1 @@ -1340,8 +1320,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 @@ -1366,8 +1345,7 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm2, %zmm1, %zmm1 @@ -1763,13 +1741,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vpslldq {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] +; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm8 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = 
ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] -; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 +; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm10 ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: # ymm11 = mem[0,1,0,1] @@ -1783,18 +1761,17 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] -; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1824,13 +1801,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm8 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm10 ; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] @@ -1844,18 +1821,17 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 ; AVX2-FP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = 
ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1885,13 +1861,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] @@ -1905,18 +1881,17 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: 
vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1962,8 +1937,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -2009,8 +1983,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -2056,8 +2029,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -2103,8 +2075,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -2148,8 +2119,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm4 @@ -2184,8 +2154,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm4, %zmm4 @@ -2220,8 
+2189,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm4, %zmm4 @@ -2256,8 +2224,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm4, %zmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 86efcf9c57616..f18a61a296711 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -45,8 +45,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-LABEL: store_i8_stride5_vf2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -58,8 +58,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-LABEL: store_i8_stride5_vf2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -71,8 +71,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-LABEL: store_i8_stride5_vf2: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -84,8 +84,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-LABEL: store_i8_stride5_vf2: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -97,8 +97,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-LABEL: store_i8_stride5_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -110,8 +110,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-LABEL: store_i8_stride5_vf2: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -123,8 +123,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-LABEL: store_i8_stride5_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -136,8 +136,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -149,8 +149,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-LABEL: store_i8_stride5_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -162,8 +162,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-LABEL: store_i8_stride5_vf2: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -175,8 +175,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-LABEL: store_i8_stride5_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -188,8 +188,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -217,9 +217,9 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1] @@ -264,9 +264,9 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero @@ -280,8 +280,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-LABEL: store_i8_stride5_vf4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -298,8 +298,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-LABEL: store_i8_stride5_vf4: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -316,8 +316,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-LABEL: store_i8_stride5_vf4: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -334,8 +334,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-LABEL: store_i8_stride5_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), 
%xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -352,8 +352,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-LABEL: store_i8_stride5_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -370,8 +370,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-LABEL: store_i8_stride5_vf4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -388,8 +388,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf4: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -406,8 +406,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-LABEL: store_i8_stride5_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -424,8 +424,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-LABEL: store_i8_stride5_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -442,8 +442,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-LABEL: store_i8_stride5_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} 
xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -460,8 +460,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 @@ -592,8 +592,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero -; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero +; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] @@ -614,12 +614,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovq %rax, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-NEXT: vmovq %rax, %xmm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -646,12 +646,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FP-NEXT: vmovq %rax, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} 
ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FP-NEXT: vmovq %rax, %xmm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX2-FP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -678,13 +678,13 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vmovq %rax, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FCP-NEXT: vmovq %rax, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] @@ -745,7 +745,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] ; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,0,0,u,1,1] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] @@ -805,7 +805,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,0,0,u,1,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] @@ -868,7 +868,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -934,7 +934,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512DQ-BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -1126,68 +1126,68 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride5_vf16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa (%rsi), %xmm3 +; AVX-NEXT: vmovdqa (%rdi), %xmm4 +; AVX-NEXT: vmovdqa (%rsi), %xmm5 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%rcx), %xmm4 +; AVX-NEXT: vmovdqa (%rcx), %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[6,u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero ; AVX-NEXT: vmovdqa (%r8), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] +; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255] -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15] +; AVX-NEXT: vpblendvb %xmm7, %xmm3, %xmm6, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} 
xmm3 = xmm3[0,1],zero,xmm3[3,4,5,6],zero,xmm3[8,9,10,11],zero,xmm3[13,14,15] ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero -; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6] +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12] -; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6],zero,zero,zero,xmm4[9,8],zero,zero,zero,xmm4[11,10],zero,zero,zero,xmm4[13,12] +; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero +; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8] -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%r9) -; AVX-NEXT: vmovdqa %xmm2, 16(%r9) -; AVX-NEXT: vmovdqa %xmm9, (%r9) +; AVX-NEXT: vmovdqa %xmm4, 16(%r9) ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX-NEXT: vmovdqa %xmm9, (%r9) ; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX-NEXT: vmovdqa %xmm5, 32(%r9) +; AVX-NEXT: vmovdqa %xmm3, 32(%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride5_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm2 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm0[6],zero,zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero ; AVX2-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero ; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9],zero,zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28],zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] @@ -1196,22 +1196,22 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[1,3,2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,0] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3,19],zero,zero,zero,ymm2[28,20],zero,zero,zero,ymm2[29,21],zero,zero,zero,ymm2[30,22] +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-NEXT: vmovdqa %ymm3, 32(%r9) @@ -1220,17 +1220,17 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FP-LABEL: store_i8_stride5_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, 
%ymm2 +; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm0[6],zero,zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero ; AVX2-FP-NEXT: vpor %ymm4, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9],zero,zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28],zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] @@ -1239,22 +1239,22 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[1,3,2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] -; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,0] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3,19],zero,zero,zero,ymm2[28,20],zero,zero,zero,ymm2[29,21],zero,zero,zero,ymm2[30,22] +; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) @@ -1263,39 +1263,39 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FCP-LABEL: store_i8_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,2,0] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] ; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[3,7],zero,zero,zero,ymm0[8,12],zero,zero,zero,ymm0[9,13],zero,zero,zero,ymm0[18,22],zero,zero,zero,ymm0[19,23],zero,zero,zero,ymm0[24,28],zero,zero +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28] -; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,6],zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,17],zero,zero,zero,ymm1[22,18],zero,zero,zero,ymm1[23,19],zero,zero,zero,ymm1[24,28] +; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] ; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-FCP-NEXT: vzeroupper @@ -1303,31 +1303,31 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride5_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u],zero,ymm0[7,u,u,u],zero,ymm0[8,u,u,u],zero,ymm0[9,u,u,u,26],zero,ymm0[u,u,u,27],zero,ymm0[u,u,u,28],zero,ymm0[u,u] ; AVX512-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-NEXT: vpshufb 
{{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm2) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[6],zero,ymm3[u,u,u,7],zero,ymm3[u,u,u,8],zero,ymm3[u,u,u,9,25,u,u,u],zero,ymm3[26,u,u,u],zero,ymm3[27,u,u,u],zero,ymm3[28] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm3[1,3,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] @@ -1342,35 +1342,35 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i8_stride5_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero -; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,xmm1[5,13,u],zero,zero,xmm1[6,14,u],zero,zero,xmm1[7,15,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,5,13],zero,zero,xmm0[u,6,14],zero,zero,xmm0[u,7,15],zero,zero,xmm0[u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -1378,31 +1378,31 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i8_stride5_vf16: ; AVX512DQ: # %bb.0: -; 
AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u],zero,ymm0[7,u,u,u],zero,ymm0[8,u,u,u],zero,ymm0[9,u,u,u,26],zero,ymm0[u,u,u,27],zero,ymm0[u,u,u,28],zero,ymm0[u,u] ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm2) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[6],zero,ymm3[u,u,u,7],zero,ymm3[u,u,u,8],zero,ymm3[u,u,u,9,25,u,u,u],zero,ymm3[26,u,u,u],zero,ymm3[27,u,u,u],zero,ymm3[28] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ 
zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm3[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] @@ -1417,35 +1417,35 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,xmm1[5,13,u],zero,zero,xmm1[6,14,u],zero,zero,xmm1[7,15,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,5,13],zero,zero,xmm0[u,6,14],zero,zero,xmm0[u,7,15],zero,zero,xmm0[u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -1455,39 +1455,39 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero -; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX512BW-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX512BW-NEXT: vpor %ymm5, %ymm4, %ymm4 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3,19],zero,zero,zero,ymm2[28,20],zero,zero,zero,ymm2[29,21],zero,zero,zero,ymm2[30,22] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 +; AVX512BW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpermd %zmm3, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 -; AVX512BW-NEXT: vmovdqa %xmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12],zero,zero,zero,zero,xmm3[13],zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,xmm3[15] +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 | xmm0 | xmm1 +; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1495,26 +1495,26 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,4,5,0,1,17,21,18,22,22,18,19,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,4,u,4,5,u,1,17,21,18,22,22,18,19,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,0,0,1,4,5,17,21,18,22,18,22,19,23] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,u,4,u,0,1,4,5,17,21,18,22,18,22,19,23] ; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero ; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 64(%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -1525,39 +1525,39 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm4, %ymm4 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,2,2,0] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3,19],zero,zero,zero,ymm2[28,20],zero,zero,zero,ymm2[29,21],zero,zero,zero,ymm2[30,22] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vpermd %zmm3, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12],zero,zero,zero,zero,xmm3[13],zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,xmm3[15] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 | xmm0 | xmm1 +; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1565,26 +1565,26 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,4,5,0,1,17,21,18,22,22,18,19,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,4,u,4,5,u,1,17,21,18,22,22,18,19,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,0,0,1,4,5,17,21,18,22,18,22,19,23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,u,4,u,0,1,4,5,17,21,18,22,18,22,19,23] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 64(%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -2116,12 +2116,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255,255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] @@ -2150,8 +2149,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm10[6],zero,xmm10[8,u],zero,xmm10[7],zero,xmm10[9],zero,xmm10[11,u],zero,xmm10[10],zero,xmm10[12] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero ; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[u],zero,xmm8[u,10],zero,xmm8[12],zero,xmm8[u,11] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] @@ -2164,9 +2163,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] @@ -2175,7 +2174,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] ; AVX2-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] @@ -2185,7 +2184,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] ; AVX2-NEXT: 
vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -2199,554 +2198,554 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FP-LABEL: store_i8_stride5_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero -; AVX2-FP-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] +; AVX2-FP-NEXT: 
vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6],zero,xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[11],zero,xmm6[u,10],zero,xmm6[12],zero ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[8,u],zero,xmm4[7],zero,xmm4[9,u,11,u],zero,xmm4[10],zero,xmm4[12,u],zero +; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,ymm1[19,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25],zero,ymm1[23] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25] ; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] -; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,2,1,1,4,6,5,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[29,26],zero,ymm0[28],zero,ymm0[30],zero,ymm0[28,29],zero,ymm0[31],zero,ymm0[29] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[27],zero,zero,ymm5[26],zero,ymm5[28],zero,ymm5[30],zero,zero,ymm5[29],zero,ymm5[31],zero ; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[26,27,28,29],zero,ymm1[31],zero,ymm1[29,30],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,3,3,6,6,7,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] -; AVX2-FP-NEXT: vpermd 
%ymm3, %ymm9, %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm5[13],zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,ymm5[18],zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] -; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,u,4,4,4] +; AVX2-FP-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FP-NEXT: vmovdqa %ymm6, 96(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride5_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 +; 
AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero +; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[u],zero,xmm6[u,10],zero,xmm6[12],zero,xmm6[u,11] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero +; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero -; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero -; 
AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,ymm2[19,20],zero,ymm2[22],zero,ymm2[24],zero,ymm2[22,23],zero,ymm2[25],zero,ymm2[23] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[19],zero,ymm6[21],zero,zero,ymm6[20],zero,ymm6[22],zero,ymm6[24],zero,zero,ymm6[23],zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25] ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[29,26],zero,ymm0[28],zero,ymm0[30],zero,ymm0[28,29],zero,ymm0[31],zero,ymm0[29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[29,26],zero,ymm2[28],zero,ymm2[26,27,28,29],zero,ymm2[31],zero,ymm2[29,30],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = 
[3,3,3,3,0,4,4,4] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm6[13],zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,ymm6[18],zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i8_stride5_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512-NEXT: vmovdqa (%r8), %ymm0 -; AVX512-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] -; AVX512-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero -; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] -; AVX512-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero -; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6)) -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512-NEXT: 
vmovdqa (%rcx), %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] +; AVX512-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero +; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6)) -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm9[0,1,2,3] -; AVX512-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] -; AVX512-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19] +; AVX512-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] +; AVX512-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero +; AVX512-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm7[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512-NEXT: vmovdqa (%r8), %xmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] +; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] ; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] -; AVX512-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm10 & (ymm8 ^ ymm7)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] +; AVX512-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] ; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero -; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,2,1,1,4,6,5,5] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] ; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] -; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u] -; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] +; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) -; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,3,3,6,6,7,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) +; AVX512-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride5_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero -; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[6],zero,xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[11],zero,xmm6[u,10],zero,xmm6[12],zero +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm9 & (ymm8 ^ ymm4)) +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm8[0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] -; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] -; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] -; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm8[0,1,2,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512-FCP-NEXT: vpermd %zmm3, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u,u,19] +; AVX512-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm9 & (ymm8 ^ ymm7)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21,u],zero,ymm2[20],zero,ymm2[22],zero,ymm2[24,u],zero,ymm2[23],zero,ymm2[25,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] ; AVX512-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[12],zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,u,5,5,5,5,u,6] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm2[26],zero,ymm2[28,u],zero,ymm2[u],zero,ymm2[29],zero,ymm2[31,u],zero,ymm2[30] +; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX512-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm5 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[6,6,6,u,7,7,7,7] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero -; AVX512DQ-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6)) -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero +; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6)) -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] -; AVX512DQ-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19] +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero +; AVX512DQ-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm7[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] +; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] ; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] -; AVX512DQ-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm10 & (ymm8 ^ ymm7)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] +; AVX512DQ-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] ; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,2,1,1,4,6,5,5] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] -; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero -; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u] -; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] +; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) -; AVX512DQ-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,3,3,6,6,7,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; 
AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[6],zero,xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[11],zero,xmm6[u,10],zero,xmm6[12],zero +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm9 & (ymm8 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] -; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] +; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u,u,19] +; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm9 & (ymm8 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21,u],zero,ymm2[20],zero,ymm2[22],zero,ymm2[24,u],zero,ymm2[23],zero,ymm2[25,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] ; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[12],zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,u,5,5,5,5,u,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm2[26],zero,ymm2[28,u],zero,ymm2[u],zero,ymm2[29],zero,ymm2[31,u],zero,ymm2[30] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm5 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [6,6,6,u,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; 
AVX512BW-LABEL: store_i8_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero +; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm8 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512BW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm7 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero ; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm7 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] -; AVX512BW-NEXT: vpermd %ymm4, %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [3,3,3,u,4,4,4,4] +; AVX512BW-NEXT: vpermd %ymm2, %ymm7, %ymm7 ; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 ; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} 
ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,2,3,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: movl $693250386, %eax # imm = 0x29522952 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3] +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,3,3] ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512BW-NEXT: movl $-2078209982, %eax # imm = 0x84210842 @@ -2754,18 +2753,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride5_vf32: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 @@ -2786,25 +2785,25 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero ; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm7 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,3,3,u,4,4,4,4] ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] @@ -2812,23 +2811,23 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] ; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] ; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -2841,81 +2840,81 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-LABEL: store_i8_stride5_vf32: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] ; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero ; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm7 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] -; AVX512DQ-BW-NEXT: vpermd %ymm4, %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: 
vmovdqa {{.*#+}} ymm7 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm7, %ymm7 ; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 ; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,2,3,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: movl $693250386, %eax # imm = 0x29522952 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,3,3] ; AVX512DQ-BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-NEXT: kmovd 
%eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-BW-NEXT: movl $-2078209982, %eax # imm = 0x84210842 @@ -2923,18 +2922,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf32: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 @@ -2955,25 +2954,25 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [3,3,3,u,4,4,4,4] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] @@ -2981,23 +2980,23 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-BW-FCP-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: movl $-2078209982, %eax # imm = 0x84210842 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 @@ -4036,200 +4035,198 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-LABEL: store_i8_stride5_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $248, %rsp -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] -; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm9 -; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] +; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm8 +; AVX2-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10 +; 
AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm1 -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm3 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-NEXT: vpshufb %ymm0, %ymm12, %ymm1 +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm3 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX2-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm3, %ymm14, %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-NEXT: vpshufb %ymm3, %ymm14, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm10 -; AVX2-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm8 +; AVX2-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] -; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm2 -; AVX2-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-NEXT: vpshufb %ymm15, %ymm9, %ymm1 -; AVX2-NEXT: vmovdqa (%rsi), %ymm15 -; AVX2-NEXT: vpshufb %ymm4, %ymm15, %ymm4 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm6 +; AVX2-NEXT: vpor %ymm1, %ymm6, %ymm6 +; AVX2-NEXT: vmovdqa (%rcx), %ymm8 +; AVX2-NEXT: vpshufb %ymm3, %ymm8, %ymm0 ; AVX2-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,2,1,1,4,6,5,5] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,1,1,4,6,5,5] +; AVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,1,1,4,6,5,5] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,1,1,4,6,5,5] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,0,4,4,4,4] -; AVX2-NEXT: vpermd %ymm13, %ymm2, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,u,4,4,4,4] +; AVX2-NEXT: vpermd %ymm12, %ymm4, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm5, %ymm15, %ymm4 -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm13 -; AVX2-NEXT: vpor %ymm4, %ymm13, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] -; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4] -; AVX2-NEXT: vpermd %ymm10, %ymm2, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm13, %ymm4, %ymm4 +; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm5 +; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-NEXT: vpshufb %ymm4, %ymm14, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm12 +; AVX2-NEXT: vpor %ymm5, %ymm12, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] +; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,3,u,4,4,4] +; AVX2-NEXT: vpermd %ymm9, %ymm4, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm0 -; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX2-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] -; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 ; AVX2-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm6 ; AVX2-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 +; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm7 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] ; AVX2-NEXT: vpshufb %ymm0, %ymm14, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = 
ymm11[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] ; AVX2-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm3 +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm4 ; AVX2-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-NEXT: # ymm5 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm9[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] -; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; 
AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -4239,11 +4236,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm2, 256(%r9) ; AVX2-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX2-NEXT: vmovdqa %ymm6, 160(%r9) +; AVX2-NEXT: vmovdqa %ymm7, 160(%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-NEXT: vmovdqa %ymm1, 288(%r9) -; AVX2-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-NEXT: addq $248, %rsp @@ -4262,8 +4259,6 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -4280,6 +4275,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] @@ -4294,11 +4291,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1 @@ -4307,27 +4303,26 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm13 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm8, %ymm14, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm13, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm11, %ymm7 ; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm8 @@ -4352,29 +4347,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm9 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm11, %ymm15 ; AVX2-FP-NEXT: vpor %ymm9, %ymm15, %ymm9 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX2-FP-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm13 +; AVX2-FP-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,2,3,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm9 ; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm14, %ymm6 ; AVX2-FP-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] @@ -4382,7 +4375,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,2,1,1,4,6,5,5] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,3,3,0,4,4,4,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,3,u,4,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm4 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm8 @@ -4400,10 +4393,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm14, %ymm4 ; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,3,u,4,4,4] ; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 @@ -4466,7 +4459,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: subq $168, %rsp ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm12 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 @@ -4503,11 +4496,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm3 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; 
AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4517,8 +4510,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -4526,16 +4518,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 ; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm1 ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm10 @@ -4547,22 +4538,20 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm4 ; 
AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm7 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX2-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm10 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm15 @@ -4575,18 +4564,18 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm8 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4 ; AVX2-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,u,4,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 @@ -4604,14 +4593,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm3 ; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,3,0,4,4,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,u,4,4,4] ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -4638,8 +4627,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm4 @@ -4698,32 +4687,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm23 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] ; AVX512-NEXT: vpshufb %ymm8, %ymm5, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,128,26,128,28,u,128,u,128,29,128,31,u,128,30] ; AVX512-NEXT: vpshufb %ymm5, %ymm11, %ymm1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] -; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,u,128,20,128,22,128,24,u,128,23,128,25,u] ; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm11 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] ; AVX512-NEXT: vpshufb %ymm11, %ymm14, %ymm1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm25 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u] ; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm13 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm14 ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm27 ; AVX512-NEXT: vmovdqa (%rcx), %ymm14 @@ -4809,7 +4790,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm11 & (zmm10 ^ zmm3)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,u,13,13,13,13,u,14,14,14,14,u,15,15,15,15] ; AVX512-NEXT: vpermd %zmm7, %zmm3, %zmm3 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm28 & (zmm3 ^ zmm10)) ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,2,3,3] @@ -4823,13 +4804,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm12 & mem) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] ; AVX512-NEXT: vpermd %zmm7, %zmm0, %zmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,0,1,1,4,4,5,5] ; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512-NEXT: vpermd %zmm7, %zmm1, %zmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r9) @@ -4876,11 +4857,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vporq %ymm7, %ymm4, %ymm25 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9 ; AVX512-FCP-NEXT: vporq %ymm4, %ymm9, %ymm26 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9 @@ -4888,28 +4867,22 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13 ; AVX512-FCP-NEXT: vporq %ymm4, %ymm13, %ymm27 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,u,128,20,128,22,128,24,u,128,23,128,25,u] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13 ; AVX512-FCP-NEXT: vporq %ymm7, %ymm13, %ymm28 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,128,26,128,28,u,128,u,128,29,128,31,u,128,30] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 ; AVX512-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 @@ -4933,14 
+4906,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX512-FCP-NEXT: vpandn %ymm9, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,0,5,5,5,5,0,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,u,5,5,5,5,u,6] ; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] @@ -4951,7 +4924,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm11 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm14 @@ -4963,12 +4936,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm22[0,0,1,1] @@ -4983,17 +4956,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm28[2,2,3,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm17 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm2 & (zmm18 ^ zmm17)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,u,13,13,13,13,u,14,14,14,14,u,15,15,15,15] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm18)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm2 & (zmm0 ^ zmm20)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -5040,32 +5013,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm23 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm5, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,128,26,128,28,u,128,u,128,29,128,31,u,128,30] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm11, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,u,128,20,128,22,128,24,u,128,23,128,25,u] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm11 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] ; AVX512DQ-NEXT: vpshufb %ymm11, 
%ymm14, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm25 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm13 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm14 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm27 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 @@ -5151,7 +5116,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm11 & (zmm10 ^ zmm3)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,u,13,13,13,13,u,14,14,14,14,u,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm28 & (zmm3 ^ zmm10)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,2,3,3] @@ -5165,13 +5130,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm12 & mem) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,0,1,1,4,4,5,5] ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r9) @@ -5218,11 +5183,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = 
[128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9 ; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm9, %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9 @@ -5230,28 +5193,22 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13 ; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm13, %ymm27 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,u,128,20,128,22,128,24,u,128,23,128,25,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13 ; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm13, %ymm28 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,128,26,128,28,u,128,u,128,29,128,31,u,128,30] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 ; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 @@ -5275,14 +5232,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm10, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,0,5,5,5,5,0,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,u,5,5,5,5,u,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] @@ -5293,7 +5250,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm14 @@ -5305,12 +5262,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm22[0,0,1,1] @@ 
-5325,17 +5282,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm28[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm17 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm2 & (zmm18 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,u,13,13,13,13,u,14,14,14,14,u,15,15,15,15] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm18)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm2 & (zmm0 ^ zmm20)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) @@ -5350,7 +5307,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u] ; AVX512BW-NEXT: vpshufb %ymm10, %ymm1, %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] @@ -5369,7 +5326,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm18 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14] ; AVX512BW-NEXT: vpshufb %ymm18, %ymm3, %ymm5 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] @@ -5390,7 +5347,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-NEXT: kmovq %rax, %k4 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k4} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512BW-NEXT: vpermd %zmm0, %zmm14, %zmm14 ; AVX512BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-NEXT: 
kmovq %rax, %k2 @@ -5416,10 +5373,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,0,1,1] ; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,3,3,0,4,4,4,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,3,3,u,4,4,4,4] ; AVX512BW-NEXT: vpermd %ymm27, %ymm16, %ymm24 ; AVX512BW-NEXT: vmovdqa64 32(%rsi), %ymm28 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u] ; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vpshufb %ymm17, %ymm28, %ymm24 {%k2} @@ -5427,13 +5384,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-NEXT: kmovq %rax, %k3 ; AVX512BW-NEXT: vmovdqu8 %zmm23, %zmm9 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [9,9,10,10,10,10,10,10,11,11,11,11,u,12,12,12] ; AVX512BW-NEXT: vpermd %zmm0, %zmm23, %zmm23 ; AVX512BW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-NEXT: kmovq %rax, %k6 ; AVX512BW-NEXT: vmovdqu8 %zmm23, %zmm9 {%k6} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX512BW-NEXT: vpshufb %ymm23, %ymm28, %ymm29 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512BW-NEXT: vpshufb %ymm24, %ymm27, %ymm30 @@ -5446,8 +5402,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512BW-NEXT: vpshufb %ymm27, %ymm25, %ymm28 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512BW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512BW-NEXT: vpshufb %ymm29, %ymm26, %ymm30 ; AVX512BW-NEXT: vporq %ymm28, %ymm30, %ymm28 ; AVX512BW-NEXT: vpshufb %ymm10, %ymm26, %ymm10 @@ -5457,7 +5412,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm10 {%k4} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm18, %zmm18 ; AVX512BW-NEXT: movabsq 
$-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -5479,7 +5434,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -5500,7 +5455,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb %ymm17, %ymm3, %ymm4 {%k2} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k3} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-NEXT: kmovq %rax, %k1 @@ -5546,20 +5501,20 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm8, %xmm16 -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm15[0,0,1,1] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [3,3,3,0,4,4,4,4] -; AVX512BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm23 +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [3,3,3,u,4,4,4,4] +; AVX512BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm22 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,0,1,1] ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm24 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm23 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm22 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22 ; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [9,9,10,10,10,10,10,10,11,11,11,11,u,12,12,12] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22 ; AVX512BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -5583,7 +5538,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 @@ -5612,7 +5567,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm21, %ymm15 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13 ; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -5632,7 +5587,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 @@ -5642,16 +5597,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm17[27],zero,zero,ymm17[26],zero,ymm17[28],zero,ymm17[30],zero,zero,ymm17[29],zero,ymm17[31],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm21[26],zero,ymm21[28],zero,zero,zero,zero,ymm21[29],zero,ymm21[31],zero,zero,ymm21[30] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm24[26],zero,ymm24[28],zero,zero,ymm24[27],zero,ymm24[29],zero,ymm24[31],zero,zero,ymm24[30],zero ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 @@ -5668,7 +5623,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-BW-NEXT: 
vpbroadcastq {{.*#+}} ymm10 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm1, %ymm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] @@ -5687,7 +5642,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm18 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14] ; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm3, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] @@ -5708,7 +5663,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQ-BW-NEXT: kmovq %rax, %k4 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k4} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm14, %zmm14 ; AVX512DQ-BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 @@ -5734,10 +5689,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,0,1,1] ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,3,3,0,4,4,4,4] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,3,3,u,4,4,4,4] ; AVX512DQ-BW-NEXT: vpermd %ymm27, %ymm16, %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %ymm28 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm28, %ymm24 {%k2} @@ -5745,13 +5700,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-NEXT: kmovq %rax, %k3 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm23, %zmm9 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm23 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [9,9,10,10,10,10,10,10,11,11,11,11,u,12,12,12] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm23, %zmm23 ; AVX512DQ-BW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQ-BW-NEXT: kmovq %rax, %k6 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm23, %zmm9 {%k6} -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512DQ-BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 
{{.*#+}} ymm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm28, %ymm29 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm27, %ymm30 @@ -5764,8 +5718,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm25, %ymm28 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQ-BW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512DQ-BW-NEXT: vpshufb %ymm29, %ymm26, %ymm30 ; AVX512DQ-BW-NEXT: vporq %ymm28, %ymm30, %ymm28 ; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm26, %ymm10 @@ -5775,7 +5728,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm10 {%k4} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm18, %zmm18 ; AVX512DQ-BW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -5797,7 +5750,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -5818,7 +5771,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm3, %ymm4 {%k2} ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k3} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 @@ -5864,20 +5817,20 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm8, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm15[0,0,1,1] -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [3,3,3,0,4,4,4,4] -; AVX512DQ-BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [3,3,3,u,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,0,1,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,0,u,u,u,u,1,u,u,u,u,2,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [9,9,10,10,10,10,10,10,11,11,11,11,u,12,12,12] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -5901,7 +5854,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 @@ -5930,7 +5883,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm21, %ymm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [3,3,3,3,u,4,4,4,4,6,5,5,5,5,4,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 @@ -5950,7 +5903,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 @@ -5960,16 +5913,16 @@ define void 
@store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm17[27],zero,zero,ymm17[26],zero,ymm17[28],zero,ymm17[30],zero,zero,ymm17[29],zero,ymm17[31],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm21[26],zero,ymm21[28],zero,zero,zero,zero,ymm21[29],zero,ymm21[31],zero,zero,ymm21[30] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm24[26],zero,ymm24[28],zero,zero,ymm24[27],zero,ymm24[29],zero,ymm24[31],zero,zero,ymm24[30],zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 6d499e17bfbc6..fce73161d5697 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -21,8 +21,8 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pxor %xmm1, %xmm1 @@ -54,14 +54,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r9), %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX-NEXT: vmovdqa (%r9), %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX-NEXT: vmovq %xmm0, (%rax) @@ -71,14 +71,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-NEXT: vmovdqa (%r9), %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX2-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX2-NEXT: vmovq %xmm0, (%rax) @@ -88,14 +88,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) @@ -105,14 +105,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) @@ -122,14 +122,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r9), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512-NEXT: vmovq %xmm0, (%rax) @@ -139,14 +139,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) @@ -156,14 +156,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; 
AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) @@ -173,14 +173,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) @@ -190,14 +190,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512BW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) @@ -207,14 +207,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) @@ -224,14 +224,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) @@ -241,14 +241,14 @@ define void @store_i8_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) @@ -274,11 +274,11 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-LABEL: store_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] @@ -288,10 +288,10 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm5, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] ; SSE-NEXT: packuswb %xmm3, %xmm6 @@ -300,21 +300,21 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: por %xmm6, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movq %xmm2, 16(%rax) ; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: retq @@ -324,16 +324,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,xmm0[1,5,9,13],zero,zero,xmm0[2,6,10,14] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4],zero,zero,zero,zero,xmm1[1,5],zero,zero,zero,zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3,7,11,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,6],zero,zero,zero,zero,xmm1[3,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rax) ; AVX-NEXT: vmovdqa %xmm2, (%rax) @@ -341,16 +341,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX2-LABEL: store_i8_stride6_vf4: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -362,16 +362,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX2-FP-LABEL: store_i8_stride6_vf4: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -383,16 +383,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX2-FCP-LABEL: store_i8_stride6_vf4: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -404,16 +404,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512-LABEL: store_i8_stride6_vf4: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -425,16 +425,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512-FCP-LABEL: store_i8_stride6_vf4: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -446,16 +446,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-LABEL: store_i8_stride6_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -467,16 +467,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf4: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -488,16 
+488,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512BW-LABEL: store_i8_stride6_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -509,16 +509,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512BW-FCP-LABEL: store_i8_stride6_vf4: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -530,16 +530,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-BW-LABEL: store_i8_stride6_vf4: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -551,16 +551,16 @@ define void @store_i8_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf4: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4],zero,zero,zero,zero,ymm0[1,5],zero,zero,zero,zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 @@ -651,7 +651,6 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX-LABEL: store_i8_stride6_vf8: ; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -668,15 +667,16 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u,u],zero,zero,xmm1[1,9,u,u],zero,zero,xmm1[2,10] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,u,1,9],zero,zero,xmm0[u,u,2,10],zero,zero +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7] ; AVX-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX-NEXT: vmovaps %ymm3, (%rax) ; AVX-NEXT: vzeroupper @@ -702,13 +702,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX2-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-NEXT: vzeroupper @@ -734,13 +734,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX2-FP-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FP-NEXT: vzeroupper @@ -766,13 +766,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14],ymm2[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX2-FCP-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -780,7 +780,6 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512-LABEL: store_i8_stride6_vf8: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -795,12 +794,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -812,7 +812,6 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512-FCP-LABEL: store_i8_stride6_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -827,12 +826,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -844,7 +844,6 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-LABEL: store_i8_stride6_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -859,12 +858,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -876,7 +876,6 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -891,12 +890,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -923,14 +923,14 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -957,14 +957,14 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -991,14 +991,14 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -1025,14 +1025,14 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -1257,7 +1257,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride6_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 @@ -1268,29 +1267,30 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] ; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero,ymm5[18,26] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,2,1,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] ; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-NEXT: vmovdqa %ymm3, (%rax) @@ -1299,7 +1299,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FP-LABEL: store_i8_stride6_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 @@ -1310,29 +1309,30 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] ; AVX2-FP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero,ymm5[18,26] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,2,1,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] ; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, 
%ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax) @@ -1341,7 +1341,6 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-FCP-LABEL: store_i8_stride6_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 @@ -1352,29 +1351,30 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,ymm3[20,28],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,ymm4[21,29] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] ; AVX2-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,ymm4[7,15],zero,zero,zero,zero,ymm4[16,24],zero,zero,zero,zero,ymm4[17,25],zero,zero,zero,zero,ymm4[18,26],zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero,ymm5[18,26] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,2,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] ; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15,u,u,u,u,16,24,u,u,u,u,17,25,u,u,u,u] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] ; AVX2-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) @@ -1383,36 +1383,36 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride6_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,u,1,9],zero,zero,ymm4[u,u,2,10],zero,zero,ymm4[u,u,19,27],zero,zero,ymm4[u,u,20,28],zero,zero,ymm4[u,u,21,29] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,1,9],zero,zero,ymm5[u,u,2,10],zero,zero,ymm5[u,u,19,27],zero,zero,ymm5[u,u,20,28],zero,zero,ymm5[u,u,21,29] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[5,13,u,u],zero,zero,ymm6[6,14,u,u],zero,zero,ymm6[7,15,u,u],zero,zero,ymm6[16,24,u,u],zero,zero,ymm6[17,25,u,u],zero,zero,ymm6[18,26] +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero +; AVX512-NEXT: vporq %zmm2, %zmm5, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm2 & mem) ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u],zero,zero,ymm1[3,11,u,u],zero,zero,ymm1[4,12,u,u],zero,zero,ymm1[21,29,u,u],zero,zero,ymm1[22,30,u,u],zero,zero,ymm1[23,31,u,u] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,3,11],zero,zero,ymm0[u,u,4,12],zero,zero,ymm0[u,u,5,13],zero,zero,ymm0[u,u,22,30],zero,zero,ymm0[u,u,23,31],zero,zero,ymm0[u,u] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10],zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) ; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1422,36 +1422,36 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i8_stride6_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, 
%zmm3, %zmm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,u,1,9],zero,zero,ymm4[u,u,2,10],zero,zero,ymm4[u,u,19,27],zero,zero,ymm4[u,u,20,28],zero,zero,ymm4[u,u,21,29] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,1,9],zero,zero,ymm5[u,u,2,10],zero,zero,ymm5[u,u,19,27],zero,zero,ymm5[u,u,20,28],zero,zero,ymm5[u,u,21,29] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13,u,u],zero,zero,ymm6[6,14,u,u],zero,zero,ymm6[7,15,u,u],zero,zero,ymm6[16,24,u,u],zero,zero,ymm6[17,25,u,u],zero,zero,ymm6[18,26] +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero +; AVX512-FCP-NEXT: vporq %zmm2, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm2 & mem) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u],zero,zero,ymm1[3,11,u,u],zero,zero,ymm1[4,12,u,u],zero,zero,ymm1[21,29,u,u],zero,zero,ymm1[22,30,u,u],zero,zero,ymm1[23,31,u,u] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,3,11],zero,zero,ymm0[u,u,4,12],zero,zero,ymm0[u,u,5,13],zero,zero,ymm0[u,u,22,30],zero,zero,ymm0[u,u,23,31],zero,zero,ymm0[u,u] ; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10],zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1461,36 +1461,36 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; ; AVX512DQ-LABEL: store_i8_stride6_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,u,1,9],zero,zero,ymm4[u,u,2,10],zero,zero,ymm4[u,u,19,27],zero,zero,ymm4[u,u,20,28],zero,zero,ymm4[u,u,21,29] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,1,9],zero,zero,ymm5[u,u,2,10],zero,zero,ymm5[u,u,19,27],zero,zero,ymm5[u,u,20,28],zero,zero,ymm5[u,u,21,29] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13,u,u],zero,zero,ymm6[6,14,u,u],zero,zero,ymm6[7,15,u,u],zero,zero,ymm6[16,24,u,u],zero,zero,ymm6[17,25,u,u],zero,zero,ymm6[18,26] +; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero +; AVX512DQ-NEXT: vporq %zmm2, %zmm5, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm2 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u],zero,zero,ymm1[3,11,u,u],zero,zero,ymm1[4,12,u,u],zero,zero,ymm1[21,29,u,u],zero,zero,ymm1[22,30,u,u],zero,zero,ymm1[23,31,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,3,11],zero,zero,ymm0[u,u,4,12],zero,zero,ymm0[u,u,5,13],zero,zero,ymm0[u,u,22,30],zero,zero,ymm0[u,u,23,31],zero,zero,ymm0[u,u] ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10],zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1500,36 +1500,36 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,u,1,9],zero,zero,ymm4[u,u,2,10],zero,zero,ymm4[u,u,19,27],zero,zero,ymm4[u,u,20,28],zero,zero,ymm4[u,u,21,29] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[5,13,u,u],zero,zero,ymm5[6,14,u,u],zero,zero,ymm5[7,15,u,u],zero,zero,ymm5[16,24,u,u],zero,zero,ymm5[17,25,u,u],zero,zero,ymm5[18,26] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,1,9],zero,zero,ymm5[u,u,2,10],zero,zero,ymm5[u,u,19,27],zero,zero,ymm5[u,u,20,28],zero,zero,ymm5[u,u,21,29] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[5,13,u,u],zero,zero,ymm6[6,14,u,u],zero,zero,ymm6[7,15,u,u],zero,zero,ymm6[16,24,u,u],zero,zero,ymm6[17,25,u,u],zero,zero,ymm6[18,26] +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[0,8],zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[6,14],zero,zero,zero,zero,ymm5[7,15],zero,zero,zero,zero,ymm5[16,24],zero,zero,zero,zero,ymm5[17,25],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[5,13],zero,zero,zero,zero,ymm6[6,14],zero,zero,zero,zero,ymm6[7,15],zero,zero,zero,zero,ymm6[16,24],zero,zero,zero,zero,ymm6[17,25],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vporq %zmm2, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm2 & mem) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u],zero,zero,ymm1[3,11,u,u],zero,zero,ymm1[4,12,u,u],zero,zero,ymm1[21,29,u,u],zero,zero,ymm1[22,30,u,u],zero,zero,ymm1[23,31,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,3,11],zero,zero,ymm0[u,u,4,12],zero,zero,ymm0[u,u,5,13],zero,zero,ymm0[u,u,22,30],zero,zero,ymm0[u,u,23,31],zero,zero,ymm0[u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10],zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1541,39 +1541,39 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,2,0,2] ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zmm3[21,29,37,45],zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = 
zmm2[0,8],zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[2,10],zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zmm2[21,29,37,45],zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zero,zmm2[39,47],zero,zero,zero,zero,zmm2[48,56],zero,zero,zero,zero,zmm2[49,57],zero,zero,zero,zero,zmm2[50,58] +; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58],zero,zero -; AVX512BW-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512BW-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,1,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] ; AVX512BW-NEXT: movw $-28087, %cx # imm = 0x9249 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1582,10 +1582,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zmm4[21,29,37,45],zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58] @@ -1593,7 +1593,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58],zero,zero ; AVX512BW-FCP-NEXT: vporq %zmm4, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] ; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -1618,39 +1618,39 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,2,0,2] ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zmm3[21,29,37,45],zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[0,8],zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[2,10],zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zmm2[21,29,37,45],zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zero,zmm2[39,47],zero,zero,zero,zero,zmm2[48,56],zero,zero,zero,zero,zmm2[49,57],zero,zero,zero,zero,zmm2[50,58] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58],zero,zero -; 
AVX512DQ-BW-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,1,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[3,11],zero,zero,zero,zero,ymm0[4,12],zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[3,11],zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,ymm1[21,29],zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,ymm1[23,31],zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[1,3,1,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u,u,u,21,29,u,u,u,u,22,30,u,u,u,u,23,31] ; AVX512DQ-BW-NEXT: movw $-28087, %cx # imm = 0x9249 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1659,10 +1659,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zmm4[21,29,37,45],zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58] @@ -1670,7 +1670,7 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = 
zero,zero,zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58],zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %zmm4, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 @@ -2204,13 +2204,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-LABEL: store_i8_stride6_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm5 @@ -2219,33 +2219,32 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm9 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm9 ; AVX2-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm12 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-NEXT: vpshufb %ymm7, %ymm4, 
%ymm12 +; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX2-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] ; AVX2-NEXT: vmovdqa (%r9), %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm10, %ymm12, %ymm15, %ymm12 +; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-NEXT: vmovdqa (%r9), %xmm10 -; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm15, %ymm12 ; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] @@ -2265,24 +2264,26 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX2-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] +; AVX2-NEXT: vmovdqa %ymm4, %ymm3 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 @@ -2299,18 +2300,18 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15],ymm12[24],ymm2[24],ymm12[25],ymm2[25],ymm12[26],ymm2[26],ymm12[27],ymm2[27],ymm12[28],ymm2[28],ymm12[29],ymm2[29],ymm12[30],ymm2[30],ymm12[31],ymm2[31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2354,18 +2355,18 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm9 @@ -2380,8 +2381,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm13, %ymm15, %ymm13 -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm12 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] @@ -2402,19 +2403,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 ; AVX2-FP-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] @@ -2434,18 +2435,18 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; 
AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2489,18 +2490,18 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm13 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm9 @@ -2515,8 +2516,8 
@@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm13, %ymm15, %ymm13 -; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm12 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] @@ -2537,19 +2538,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] @@ -2569,18 +2570,18 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2607,125 +2608,124 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512-NEXT: vprold $16, %ymm9, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm6 ^ (zmm14 & (zmm13 ^ zmm6)) -; AVX512-NEXT: vmovdqa (%r9), %xmm11 -; AVX512-NEXT: vmovdqa (%r8), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm14 
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (zmm6 & (zmm14 ^ zmm7)) +; AVX512-NEXT: vmovdqa (%r9), %xmm11 +; AVX512-NEXT: vmovdqa (%r8), %xmm13 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm14)) +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] +; AVX512-NEXT: vpshufb %xmm14, %xmm10, %xmm15 +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm8 = zmm15 ^ (mem & (zmm8 ^ zmm15)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ zmm8)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm10 -; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512-NEXT: vpermq {{.*#+}} zmm15 = zmm14[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm15 ^ (mem & (zmm9 ^ zmm15)) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = 
[6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm10 +; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm12 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[16],ymm12[16],ymm11[17],ymm12[17],ymm11[18],ymm12[18],ymm11[19],ymm12[19],ymm11[20],ymm12[20],ymm11[21],ymm12[21],ymm11[22],ymm12[22],ymm11[23],ymm12[23] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512-NEXT: vpshufb %ymm14, %ymm4, %ymm4 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm3 & (zmm2 ^ zmm8)) -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm2 -; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm3 & (zmm2 ^ zmm11)) +; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512-NEXT: vpshufb %ymm8, %ymm0, 
%ymm4 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm6 & (zmm10 ^ zmm9)) ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = 
ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm9 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -2733,16 +2733,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm9 & (zmm8 ^ zmm6)) -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm4 ^ (zmm9 & (zmm8 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[16],ymm4[16],ymm8[17],ymm4[17],ymm8[18],ymm4[18],ymm8[19],ymm4[19],ymm8[20],ymm4[20],ymm8[21],ymm4[21],ymm8[22],ymm4[22],ymm8[23],ymm4[23] ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm9)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm9)) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm11 @@ -2754,7 +2754,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] @@ -2770,155 +2770,155 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpunpcklbw 
{{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm15 & (zmm3 ^ zmm5)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm15 & (zmm7 ^ zmm6)) +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-NEXT: 
vmovdqa (%rcx), %ymm5 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm6 ^ (zmm14 & (zmm13 ^ zmm6)) -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512DQ-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-NEXT: 
vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (zmm6 & (zmm14 ^ zmm7)) +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm14)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm10, %xmm15 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 ^ (mem & (zmm8 ^ zmm15)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ zmm8)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm15 = zmm14[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm15 ^ (mem & (zmm9 ^ zmm15)) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm12 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[16],ymm12[16],ymm11[17],ymm12[17],ymm11[18],ymm12[18],ymm11[19],ymm12[19],ymm11[20],ymm12[20],ymm11[21],ymm12[21],ymm11[22],ymm12[22],ymm11[23],ymm12[23] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm3 & (zmm2 ^ zmm8)) -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm3 & (zmm2 ^ zmm11)) +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm6 & (zmm10 ^ zmm9)) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; 
AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm9 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -2926,16 +2926,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm9 & (zmm8 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm4 ^ (zmm9 & (zmm8 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[16],ymm4[16],ymm8[17],ymm4[17],ymm8[18],ymm4[18],ymm8[19],ymm4[19],ymm8[20],ymm4[20],ymm8[21],ymm4[21],ymm8[22],ymm4[22],ymm8[23],ymm4[23] ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm9)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm11 @@ -2947,7 +2947,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] @@ -2963,31 +2963,32 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm15 & (zmm3 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm15 & (zmm7 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -3000,34 +3001,34 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 ; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 ; AVX512BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 ; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 @@ -3041,9 +3042,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] @@ -3060,23 +3061,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 ; AVX512BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] ; AVX512BW-NEXT: vpermi2w 
%zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -3096,41 +3097,41 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 ; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} 
ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 @@ -3138,13 +3139,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 ; AVX512BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3155,23 +3156,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 ; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3191,34 +3192,34 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = 
[8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 ; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 ; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 @@ -3232,9 +3233,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] @@ -3251,23 +3252,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 ; AVX512DQ-BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] 
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -3287,41 +3288,41 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 @@ -3329,13 +3330,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa (%rcx), %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -3346,23 +3347,23 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4371,11 +4372,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride6_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8 ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm5 @@ -4383,37 +4382,39 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm1 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-NEXT: vmovdqa (%rsi), %xmm13 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), 
%xmm10 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] +; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm5 ; AVX2-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm4 +; AVX2-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm4 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 @@ -4436,18 +4437,19 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] ; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqa (%r8), %xmm7 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX2-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-NEXT: vpshufb %xmm2, %xmm15, %xmm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4456,8 +4458,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 @@ -4474,16 +4475,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm1 -; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 @@ -4494,7 +4494,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] @@ -4502,9 +4502,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] @@ -4521,14 +4521,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-NEXT: # ymm2 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] @@ -4541,20 +4541,19 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [2,1,0,3,4,4,4,4] -; AVX2-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-NEXT: vmovdqa %xmm15, %xmm14 +; AVX2-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm11 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -4562,25 +4561,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm4, %xmm11, %xmm6 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm10, %xmm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm2 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4596,7 +4594,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -4612,14 +4610,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-NEXT: # ymm5 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-NEXT: # ymm5 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm3 @@ -4632,18 +4629,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] +; AVX2-NEXT: vpshufb %ymm4, %ymm9, %ymm6 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 ; AVX2-NEXT: vpshufb %ymm4, %ymm8, %ymm4 @@ -4655,15 +4650,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] +; AVX2-NEXT: vpshufb %ymm4, %ymm10, %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -4687,17 +4681,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i8_stride6_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FP-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm6 ; 
AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4705,127 +4697,127 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm2 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm11 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 ; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FP-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; 
AVX2-FP-NEXT: vpshufb %ymm0, %ymm7, %ymm4 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm4 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm5 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm15, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm11, %ymm5, %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm6 -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 
32(%r9), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm5, %ymm5 +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm14[8],xmm9[9],xmm14[9],xmm9[10],xmm14[10],xmm9[11],xmm14[11],xmm9[12],xmm14[12],xmm9[13],xmm14[13],xmm9[14],xmm14[14],xmm9[15],xmm14[15] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = 
[0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4834,75 +4826,71 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm6, %ymm3 +; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FP-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm9[8],ymm13[8],ymm9[9],ymm13[9],ymm9[10],ymm13[10],ymm9[11],ymm13[11],ymm9[12],ymm13[12],ymm9[13],ymm13[13],ymm9[14],ymm13[14],ymm9[15],ymm13[15],ymm9[24],ymm13[24],ymm9[25],ymm13[25],ymm9[26],ymm13[26],ymm9[27],ymm13[27],ymm9[28],ymm13[28],ymm9[29],ymm13[29],ymm9[30],ymm13[30],ymm9[31],ymm13[31] +; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] -; AVX2-FP-NEXT: vmovdqa %xmm7, %xmm15 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm5 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm2, 
%ymm11, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm11, %ymm6 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm14, %ymm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm4, %ymm14, %ymm4 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm4, %ymm15, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -4916,10 +4904,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FP-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FP-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4935,38 +4923,35 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[4],ymm13[4],ymm9[5],ymm13[5],ymm9[6],ymm13[6],ymm9[7],ymm13[7],ymm9[16],ymm13[16],ymm9[17],ymm13[17],ymm9[18],ymm13[18],ymm9[19],ymm13[19],ymm9[20],ymm13[20],ymm9[21],ymm13[21],ymm9[22],ymm13[22],ymm9[23],ymm13[23] ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FP-NEXT: # ymm6 = 
ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23] ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [2,1,0,3,4,4,4,4] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm10, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm4 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm12, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] @@ -4978,12 +4963,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm14, %ymm5 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm15, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5007,17 +4991,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FP-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FP-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-FP-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride6_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FCP-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5025,127 +5007,127 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm2 ; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm11 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 ; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm3, 
%xmm8, %xmm5 +; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm4 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm5 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm15 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,5,8,7,9,9,9,9] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm11, %ymm5, %ymm7 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm6 -; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm14[8],xmm9[9],xmm14[9],xmm9[10],xmm14[10],xmm9[11],xmm14[11],xmm9[12],xmm14[12],xmm9[13],xmm14[13],xmm9[14],xmm14[14],xmm9[15],xmm14[15] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5154,75 +5136,71 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; 
AVX2-FCP-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FCP-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm9[8],ymm13[8],ymm9[9],ymm13[9],ymm9[10],ymm13[10],ymm9[11],ymm13[11],ymm9[12],ymm13[12],ymm9[13],ymm13[13],ymm9[14],ymm13[14],ymm9[15],ymm13[15],ymm9[24],ymm13[24],ymm9[25],ymm13[25],ymm9[26],ymm13[26],ymm9[27],ymm13[27],ymm9[28],ymm13[28],ymm9[29],ymm13[29],ymm9[30],ymm13[30],ymm9[31],ymm13[31] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [10,13,12,11,14,13,14,15] -; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm5 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm6 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; 
AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm4 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -5236,10 +5214,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FCP-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -5255,38 +5233,35 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm13[0],ymm9[1],ymm13[1],ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[4],ymm13[4],ymm9[5],ymm13[5],ymm9[6],ymm13[6],ymm9[7],ymm13[7],ymm9[16],ymm13[16],ymm9[17],ymm13[17],ymm9[18],ymm13[18],ymm9[19],ymm13[19],ymm9[20],ymm13[20],ymm9[21],ymm13[21],ymm9[22],ymm13[22],ymm9[23],ymm13[23] ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23] ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [2,1,0,3,4,4,4,4] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] @@ -5298,12 +5273,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5327,7 +5301,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FCP-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-FCP-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -5343,7 +5317,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r9), %xmm12 ; AVX512-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0 ; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -5391,43 +5365,37 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm0 ; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm12 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm12, %ymm13, %ymm0 ; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm1 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31] ; AVX512-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] ; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm4, %ymm13, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] ; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 ; AVX512-NEXT: vmovdqa (%rsi), %ymm1 @@ -5490,8 +5458,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm5 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] ; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm5 ; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm6 
; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] @@ -5499,8 +5466,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm13 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] ; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm3 ; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm14 @@ -5616,7 +5582,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512-FCP-NEXT: vprold $16, %ymm0, %ymm18 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 @@ -5625,20 +5591,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 @@ -5664,8 +5626,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 @@ -5685,8 +5646,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] @@ -5720,8 +5680,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm9 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm24 & (zmm9 ^ zmm13)) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 @@ -5736,23 +5695,21 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm12 & (zmm8 ^ zmm9)) -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm9 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm12 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[16],ymm9[16],ymm12[17],ymm9[17],ymm12[18],ymm9[18],ymm12[19],ymm9[19],ymm12[20],ymm9[20],ymm12[21],ymm9[21],ymm12[22],ymm9[22],ymm12[23],ymm9[23] ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm18 & (zmm1 ^ zmm9)) @@ -5787,11 +5744,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] ; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 @@ -5860,7 +5817,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm12 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -5908,43 +5865,37 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm13, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31] ; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 @@ -6007,8 +5958,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm5 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm5 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] @@ -6016,8 +5966,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm13 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm3 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm14 @@ -6133,7 +6082,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-FCP-NEXT: vprold $16, %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 @@ -6142,20 +6091,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 @@ -6181,8 +6126,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 @@ -6202,8 +6146,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] @@ -6237,8 +6180,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm9 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm24 & (zmm9 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 @@ -6253,23 +6195,21 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm12 & (zmm8 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm12 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[16],ymm9[16],ymm12[17],ymm9[17],ymm12[18],ymm9[18],ymm12[19],ymm9[19],ymm12[20],ymm9[20],ymm12[21],ymm9[21],ymm12[22],ymm9[22],ymm12[23],ymm9[23] ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm6 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm18 & (zmm1 ^ zmm9)) @@ -6304,11 +6244,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 @@ -6378,7 +6318,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 ; AVX512BW-NEXT: 
vinserti64x4 $1, %ymm3, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm3 @@ -6437,7 +6377,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm21 ; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm22 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm8 @@ -6741,7 +6681,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm3 @@ -6800,7 +6740,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm21 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u] ; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm22 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index f4055a953badd..ca196fc45aaa8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -22,9 +22,9 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] @@ -64,13 +64,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; 
AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -83,13 +83,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX2-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -102,13 +102,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX2-FP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-FP-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX2-FP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -121,13 +121,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX2-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX2-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -140,13 +140,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX512-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX512-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: retq ; @@ -159,30 +159,30 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; 
AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX512-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX512-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride7_vf2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512DQ-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) @@ -191,17 +191,17 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf2: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) @@ -216,13 +216,13 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX512BW-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512BW-NEXT: vpextrd $2, %xmm0, 8(%rax) +; AVX512BW-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: retq ; @@ -235,30 +235,30 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] -; AVX512BW-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) +; 
AVX512BW-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i8_stride7_vf2: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) @@ -267,17 +267,17 @@ define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf2: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) @@ -310,10 +310,10 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vecp ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm5 -; SSE-NEXT: movdqa (%r10), %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; SSE-NEXT: movdqa (%r10), %xmm2 ; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] @@ -382,38 +382,38 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-LABEL: store_i8_stride7_vf4: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,zero,xmm0[1,5,9,13],zero,zero,zero,xmm0[2,6] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4,8],zero,zero,zero,zero,xmm1[1,5,9],zero,zero -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,14],zero,zero,zero,xmm0[3,7,11,15],zero,zero,zero,xmm0[u,u,u,u] +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,6,10],zero,zero,zero,zero,xmm1[3,7,11,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rax) +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX-NEXT: vmovq %xmm0, 16(%rax) -; AVX-NEXT: vmovdqa %xmm2, (%rax) +; AVX-NEXT: vmovdqa %xmm1, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride7_vf4: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -429,15 +429,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-LABEL: store_i8_stride7_vf4: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} 
xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -453,15 +453,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-LABEL: store_i8_stride7_vf4: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -477,15 +477,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-LABEL: store_i8_stride7_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -501,15 +501,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-LABEL: store_i8_stride7_vf4: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; 
AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -532,8 +532,8 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -556,8 +556,8 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -573,15 +573,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-LABEL: store_i8_stride7_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -597,15 +597,15 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-LABEL: store_i8_stride7_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -628,8 +628,8 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -652,8 +652,8 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -851,7 +851,6 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-LABEL: store_i8_stride7_vf8: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -861,49 +860,49 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u] +; AVX-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm5[4,u,u,u,u],zero,zero,xmm5[5,u,u,u,u],zero,zero +; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; AVX-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[6,u,u,u,u],zero,zero,xmm5[7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,0,8],zero,xmm0[u,u,u,u,1,9],zero,xmm0[u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u],zero,zero,xmm3[0,u,u,u,u],zero,zero,xmm3[1,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u],zero,zero,xmm5[0,u,u,u,u],zero,zero,xmm5[1,u,u] ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; AVX-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[u,u,u,3,11],zero,zero,xmm1[u,u,u,4,12],zero,zero +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,10,u,u,u],zero,zero,xmm2[3,11,u,u,u],zero,zero,xmm2[4,12] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,10],zero,xmm0[u,u,u,u,3,11],zero,xmm0[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u],zero,zero,xmm5[2,u,u,u,u],zero,zero,xmm5[3,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rax) ; AVX-NEXT: vmovdqa %xmm6, (%rax) -; AVX-NEXT: vmovq %xmm5, 48(%rax) -; AVX-NEXT: vmovdqa %xmm4, 32(%rax) +; AVX-NEXT: vmovq %xmm4, 48(%rax) +; AVX-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride7_vf8: ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vmovq 
{{.*#+}} xmm2 = mem[0],zero @@ -928,6 +927,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero @@ -945,7 +945,6 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-LABEL: store_i8_stride7_vf8: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -970,6 +969,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero @@ -987,7 +987,6 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-LABEL: store_i8_stride7_vf8: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero @@ -996,26 +995,27 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,5,3,0,3,1,5,0] +; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,3,u,3,1,5,u] ; AVX2-FCP-NEXT: 
vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,7,1,3,5,7] -; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[1,5,9,13],zero,zero,zero,ymm5[2,6,10,14],zero,zero,zero,ymm5[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,5,7,1,3,5,7] +; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm4[1,5,9,13],zero,zero,zero,ymm4[2,6,10,14],zero,zero,zero,ymm4[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] -; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,6,0,4,2,6] -; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10,18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,2,6,0,4,2,6] +; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) @@ -1041,24 +1041,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,ymm1[0,8,u,u,u],zero,zero,ymm1[1,9,u,u,u],zero,zero,zero,zero,ymm1[u,u,u,19,27],zero,zero,ymm1[u,u,u,20,28],zero,zero +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u],zero,zero,ymm1[5,13,u,u,u],zero,zero,ymm1[6,14,u,u,u,23,31],zero,zero,ymm1[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,0,8],zero,ymm3[u,u,u,u,1,9],zero,ymm3[u,u,u,u],zero,zero,ymm3[18,u,u,u,u],zero,zero,ymm3[19,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,12],zero,ymm3[u,u,u,u,5,13],zero,ymm3[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm3[23,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, 48(%rax) @@ -1078,22 +1078,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm5) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [1,5,3,u,3,1,5,u] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm1) ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 @@ -1123,24 +1123,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm1[0,8,u,u,u],zero,zero,ymm1[1,9,u,u,u],zero,zero,zero,zero,ymm1[u,u,u,19,27],zero,zero,ymm1[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u],zero,zero,ymm1[5,13,u,u,u],zero,zero,ymm1[6,14,u,u,u,23,31],zero,zero,ymm1[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, 
%zmm3, %zmm3 -; AVX512DQ-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,0,8],zero,ymm3[u,u,u,u,1,9],zero,ymm3[u,u,u,u],zero,zero,ymm3[18,u,u,u,u],zero,zero,ymm3[19,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,12],zero,ymm3[u,u,u,u,5,13],zero,ymm3[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm3[23,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm1, 48(%rax) @@ -1160,22 +1160,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,2,6,0,4,2,6] ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[18,22,26],zero,zero,zero,zero,ymm1[19,23,27],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,3,0,3,1,5,0] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm5) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = 
[1,5,3,u,3,1,5,u] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = mem & (ymm4 | ymm1) ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,9,11,1,3,9,11] ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 @@ -1200,22 +1200,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10,18,26],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zero,zero,zero,zmm3[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vporq %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} @@ -1245,10 +1245,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,2,6,0,4,2,6] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,2,6,0,4,2,6] -; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm4 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm4[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] @@ -1256,7 +1256,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[1,5,9,13],zero,zero,zero,ymm4[2,6,10,14],zero,zero,zero,ymm4[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,3,0,3,1,5,0] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,3,u,3,1,5,u] ; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -1278,22 +1278,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; 
AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10,18,26],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zero,zero,zero,zmm3[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX512DQ-BW-NEXT: vporq %zmm3, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} @@ -1323,10 +1323,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,2,6,0,4,2,6] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,2,6,0,4,2,6] -; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm4 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm4[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] @@ -1334,7 +1334,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[1,5,9,13],zero,zero,zero,ymm4[2,6,10,14],zero,zero,zero,ymm4[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,3,0,3,1,5,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,3,u,3,1,5,u] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[2,6,18],zero,zero,zero,zero,ymm0[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -1713,189 +1713,188 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-LABEL: store_i8_stride7_vf16: ; AVX: # %bb.0: ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vmovdqa (%rsi), %xmm5 -; AVX-NEXT: vmovdqa (%rdx), %xmm6 -; AVX-NEXT: vmovdqa (%rcx), %xmm7 +; AVX-NEXT: vmovdqa (%rsi), %xmm4 +; AVX-NEXT: vmovdqa (%rdx), %xmm5 +; AVX-NEXT: vmovdqa (%rcx), %xmm6 ; AVX-NEXT: vmovdqa (%r8), %xmm0 ; AVX-NEXT: vmovdqa (%r9), %xmm2 -; AVX-NEXT: vmovdqa (%r10), %xmm1 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9],zero,zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4,5],zero,zero,zero,zero,zero,xmm11[6,7],zero,zero,zero,zero,zero,xmm11[8,9] -; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,xmm11[0,1],zero,zero,zero,zero,zero,xmm11[2,3],zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero,xmm10[4,5] +; AVX-NEXT: vmovdqa (%rax), %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9],zero,zero +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[4,5],zero,zero,zero,zero,zero,xmm7[6,7],zero,zero,zero,zero,zero,xmm7[8,9] ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm7[0,1],zero,zero,zero,zero,zero,xmm7[2,3],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero,xmm10[4,5] +; AVX-NEXT: vpor %xmm9, %xmm11, %xmm11 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,4,5],zero,xmm9[u,u,u,u,6,7],zero,xmm9[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] -; AVX-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] -; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 -; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX-NEXT: vandnps %ymm8, %ymm12, %ymm8 -; AVX-NEXT: vorps %ymm4, %ymm8, %ymm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX-NEXT: vpor %xmm8, %xmm12, %xmm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,4,5],zero,xmm9[u,u,u,u,6,7],zero,xmm9[u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm11 +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm14[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] +; AVX-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm12 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u],zero,xmm4[7,u,u,u,u,u],zero,xmm4[8,u,u,u,u,u],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u,u,u] ; AVX-NEXT: vpor %xmm8, %xmm13, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u] +; AVX-NEXT: vpor %xmm14, %xmm8, %xmm14 ; 
AVX-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX-NEXT: vpblendvb %xmm8, %xmm12, %xmm13, %xmm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm11[10,11],zero,zero,zero,zero,zero,xmm11[12,13],zero,zero +; AVX-NEXT: vpblendvb %xmm8, %xmm13, %xmm14, %xmm13 +; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX-NEXT: vandnps %ymm12, %ymm14, %ymm12 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,zero,xmm7[10,11],zero,zero,zero,zero,zero,xmm7[12,13],zero,zero +; AVX-NEXT: vorps %ymm12, %ymm11, %ymm7 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero,zero,zero -; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX-NEXT: vandps %ymm11, %ymm10, %ymm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] -; AVX-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm12[u,u,u,u,5,6],zero,xmm12[u,u,u,u,12,13],zero,xmm12[u] +; AVX-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm11[u,u,u,u,5,6],zero,xmm11[u,u,u,u,12,13],zero,xmm11[u] +; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u] -; AVX-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9],zero,xmm9[u,u,u,u,10,11],zero,xmm9[u,u,u,u,12,13] ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero +; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 ; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 -; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 ; AVX-NEXT: vorps %ymm9, %ymm10, %ymm9 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm10[8,9],zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6],zero,zero,zero,zero,zero,xmm12[9,8],zero,zero,zero,zero,zero,xmm12[11,10],zero ; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[2,3],zero,zero,zero,zero,zero,xmm6[4,5],zero,zero,zero,zero,zero,xmm6[6] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[2,3],zero,zero,zero,zero,zero,xmm5[4,5],zero,zero,zero,zero,zero,xmm5[6] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,zero,zero,xmm3[5,4],zero,zero,zero,zero,zero,xmm3[7,6],zero -; AVX-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u],zero,zero,xmm1[11,u,u,u,u],zero,zero,xmm1[12,u,u,u,u],zero ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u],zero,zero,xmm1[11,u,u,u,u],zero,zero,xmm1[12,u,u,u,u],zero -; AVX-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] +; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u],zero,zero,xmm1[9,u,u,u,u],zero,zero,xmm1[10,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX-NEXT: vandnps %ymm5, %ymm8, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm6[12,13],zero,zero,zero,zero,zero,xmm6[14,15],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm10[12,13],zero,zero,zero,zero,zero,xmm10[14,15],zero,zero,zero,zero,zero -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm10[12,13],zero,zero,zero,zero,zero,xmm10[14,15],zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vpunpckhbw 
{{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX-NEXT: vmovaps %ymm3, 64(%rax) ; AVX-NEXT: vmovaps %ymm9, 32(%rax) -; AVX-NEXT: vmovaps %ymm4, (%rax) +; AVX-NEXT: vmovaps %ymm7, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride7_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 -; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] +; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] -; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero +; AVX2-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[1,1,0,0,4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28] -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero ; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,6] ; AVX2-NEXT: 
vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero ; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,7,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] +; AVX2-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1 ; AVX2-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX2-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i8_stride7_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FP-NEXT: vmovdqa 
(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 ; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 @@ -1909,7 +1908,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm7, %ymm4, %ymm7 -; AVX2-FP-NEXT: vmovdqa (%r10), %xmm4 +; AVX2-FP-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] @@ -1917,9 +1916,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] @@ -1930,13 +1929,14 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero ; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 @@ -1963,22 +1963,21 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-LABEL: store_i8_stride7_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 ; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] @@ -1987,11 +1986,11 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6] ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 @@ -2001,16 +2000,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8 ; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,6] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 @@ -2037,68 +2037,68 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: store_i8_stride7_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-NEXT: vmovdqa (%r9), %xmm2 -; AVX512-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u],zero,ymm6[5,u,u,u,u,u],zero,ymm6[6,u,u,u,u,u,23],zero,ymm6[u,u,u,u,u,24],zero,ymm6[u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ~ymm7 & (ymm6 | ymm5) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm4, %ymm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u,u],zero -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u,u,u,u,25] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 & (ymm8 | ymm6) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 +; AVX512-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u],zero,ymm5[5,u,u,u,u,u],zero,ymm5[6,u,u,u,u,u,23],zero,ymm5[u,u,u,u,u,24],zero,ymm5[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ~ymm6 & (ymm5 | ymm4) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm6 & (ymm7 | ymm4) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10],zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm6 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm4) +; AVX512-NEXT: vmovdqa (%rax), %xmm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm6 +; 
AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[1,1,0,0,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm6 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u] -; AVX512-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5)) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm6 & ~mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[6,14,u,u,u],zero,zero,xmm2[7,15,u,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,u,u,u,u],zero,zero,xmm4[14,u,u,u,u],zero,zero,xmm4[15] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512-NEXT: vzeroupper @@ -2107,36 +2107,35 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-LABEL: store_i8_stride7_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm4 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,ymm2[18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6] ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,17,u,17,u,16,16,u,0,1,0,1,2,3,u,1] ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] ; AVX512-FCP-NEXT: # ymm6 = 
mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 @@ -2147,7 +2146,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] ; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 @@ -2159,12 +2158,13 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) @@ -2174,68 +2174,68 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-LABEL: store_i8_stride7_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm5 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u],zero,ymm6[5,u,u,u,u,u],zero,ymm6[6,u,u,u,u,u,23],zero,ymm6[u,u,u,u,u,24],zero,ymm6[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ~ymm7 & (ymm6 | ymm5) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm4, %ymm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 & (ymm8 | ymm6) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u],zero,ymm5[5,u,u,u,u,u],zero,ymm5[6,u,u,u,u,u,23],zero,ymm5[u,u,u,u,u,24],zero,ymm5[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ~ymm6 & (ymm5 | ymm4) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm6 & (ymm7 | ymm4) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 
= ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10],zero,zero,ymm4[u,u,u,19,27],zero,zero,ymm4[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm4) +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[1,1,0,0,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm6 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u] -; AVX512DQ-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-NEXT: vpermq 
{{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm6 & ~mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[6,14,u,u,u],zero,zero,xmm2[7,15,u,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512DQ-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,u,u,u,u],zero,zero,xmm4[14,u,u,u,u],zero,zero,xmm4[15] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -2244,36 +2244,35 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,ymm2[18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm2 ; AVX512DQ-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6] ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,17,u,17,u,16,16,u,0,1,0,1,2,3,u,1] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 @@ -2284,7 +2283,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] ; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 @@ -2296,12 +2295,13 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) @@ -2315,142 +2315,142 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 -; AVX512BW-NEXT: vmovdqa (%r10), %xmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] -; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm7 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25] +; AVX512BW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512BW-NEXT: vpor %ymm6, %ymm8, %ymm5 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm5 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[18,26],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28] +; AVX512BW-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX512BW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-NEXT: vpermw %zmm7, %zmm8, %zmm8 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm6 {%k1} ; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # 
imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,7,7,7] ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] ; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vpermw %ymm7, %ymm3, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm5 {%k1} ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm0 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512BW-NEXT: vmovdqa %xmm4, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride7_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 
(%r9), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm4 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rdx # imm = 0x4081020408102040 -; AVX512BW-FCP-NEXT: kmovq %rdx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] -; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[2,10],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero,zero,zmm8[57] -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zero,zero,zmm7[0,8],zero,zero,zero,zero,zero,zmm7[1,9],zero,zero,zero,zero,zero,zmm7[18,26],zero,zero,zero,zero,zero,zmm7[19,27],zero,zero,zero,zero,zero,zmm7[20,28],zero,zero,zero,zero,zero,zmm7[33,37],zero,zero,zero,zero,zero,zmm7[34,38],zero,zero,zero,zero,zero,zmm7[51,55],zero,zero,zero,zero,zero,zmm7[56,60],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm8, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[3,1,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[1,9,2,10,1,9,2,10] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,u,4,u,0,1,4,5,1,5,u,u,1,5,2,6] +; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[0,8],zero,zero,zero,zero,zero,zmm9[1,9],zero,zero,zero,zero,zero,zmm9[2,10],zero,zero,zero,zero,zero,zmm9[19,27],zero,zero,zero,zero,zero,zmm9[20,28],zero,zero,zero,zero,zero,zmm9[33,37],zero,zero,zero,zero,zero,zmm9[34,38],zero,zero,zero,zero,zero,zmm9[51,55],zero,zero,zero,zero,zero,zmm9[56,60],zero,zero,zero,zero,zero,zmm9[57] +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vpermd %zmm3, %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zero,zero,zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[18,26],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vporq %zmm9, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FCP-NEXT: movl $67637280, %eax # imm = 0x4081020 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,3,3,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,zero,zero,zero,ymm4[10,2],zero,zero,zero,zero,zero,ymm4[11,3],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,zero,ymm4[21,29],zero,zero,zero +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -2462,142 +2462,142 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm2 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = 
zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25] +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm8, %ymm5 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm5 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[18,26],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28] +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,7,7,7] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] ; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] ; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm5 {%k1} ; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf16: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rdx # imm = 0x4081020408102040 -; AVX512DQ-BW-FCP-NEXT: kmovq %rdx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[2,10],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero,zero,zmm8[57] -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zero,zero,zmm7[0,8],zero,zero,zero,zero,zero,zmm7[1,9],zero,zero,zero,zero,zero,zmm7[18,26],zero,zero,zero,zero,zero,zmm7[19,27],zero,zero,zero,zero,zero,zmm7[20,28],zero,zero,zero,zero,zero,zmm7[33,37],zero,zero,zero,zero,zero,zmm7[34,38],zero,zero,zero,zero,zero,zmm7[51,55],zero,zero,zero,zero,zero,zmm7[56,60],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm8, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, 
%rcx # imm = 0x70E1C3870E1C3870 -; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[3,1,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,9,2,10,1,9,2,10] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,u,4,u,0,1,4,5,1,5,u,u,1,5,2,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[0,8],zero,zero,zero,zero,zero,zmm9[1,9],zero,zero,zero,zero,zero,zmm9[2,10],zero,zero,zero,zero,zero,zmm9[19,27],zero,zero,zero,zero,zero,zmm9[20,28],zero,zero,zero,zero,zero,zmm9[33,37],zero,zero,zero,zero,zero,zmm9[34,38],zero,zero,zero,zero,zero,zmm9[51,55],zero,zero,zero,zero,zero,zmm9[56,60],zero,zero,zero,zero,zero,zmm9[57] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm3, %zmm8, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zero,zero,zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[18,26],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vporq %zmm9, %zmm8, %zmm7 +; 
AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-FCP-NEXT: movl $67637280, %eax # imm = 0x4081020 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,3,3,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,zero,zero,zero,ymm4[10,2],zero,zero,zero,zero,zero,ymm4[11,3],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero,zero,zero,zero,ymm4[21,29],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3331,7 +3331,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride7_vf32: ; AVX: # %bb.0: -; AVX-NEXT: subq $216, %rsp +; AVX-NEXT: subq $168, %rsp ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa 16(%rax), %xmm14 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] @@ -3343,235 +3343,231 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u] -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u] -; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u] -; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 -; AVX-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u] +; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u] +; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm2 +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm5 +; AVX-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,7],zero,xmm15[u,u,u,u,u,8],zero,xmm15[u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] -; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u,u],zero ; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX-NEXT: vandnps %ymm15, %ymm2, %ymm15 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm7 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] +; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm4 +; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa (%r9), %xmm7 -; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm0 -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%r8), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rax), %xmm8 -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] -; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX-NEXT: vmovdqa %xmm5, %xmm8 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 -; AVX-NEXT: vmovdqa (%rcx), %xmm3 +; AVX-NEXT: vmovdqa (%r9), %xmm3 +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm0 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm13 -; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX-NEXT: vmovdqa (%r8), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rax), %xmm12 +; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] +; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] +; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 +; AVX-NEXT: vmovdqa (%rcx), %xmm10 +; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm0 +; AVX-NEXT: vmovdqa (%rdx), %xmm9 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vmovdqa (%rsi), %xmm5 -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero ; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm11 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vandps %ymm4, %ymm11, %ymm11 -; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX-NEXT: vandnps %ymm15, %ymm11, %ymm15 +; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm11 
+; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 ; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] -; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 +; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[8,9],zero,xmm4[u,u,u,u,10,11],zero,xmm4[u,u,u,u,12,13] +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,5],zero,xmm4[u,u,u,u,6,7],zero,xmm4[u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm13 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 -; AVX-NEXT: vandnps %ymm12, %ymm4, %ymm12 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX-NEXT: vandnps %ymm15, %ymm4, %ymm12 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 +; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm11 +; AVX-NEXT: vandps %ymm0, %ymm13, %ymm4 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX-NEXT: vorps %ymm0, %ymm6, %ymm0 -; AVX-NEXT: 
vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero +; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 +; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13 +; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] -; AVX-NEXT: vpor %xmm6, %xmm12, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX-NEXT: vandnps %ymm13, %ymm14, %ymm13 +; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0],zero,xmm13[2,3,4,5,6,7],zero,xmm13[9,10,11,12,13,14],zero +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] +; AVX-NEXT: vorps %ymm4, %ymm11, %ymm4 +; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpor %xmm14, %xmm13, %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero -; AVX-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; 
AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero +; AVX-NEXT: vmovdqa %xmm2, %xmm11 +; AVX-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] +; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX-NEXT: vpshufb 
{{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u],zero,zero,xmm12[11,u,u,u,u],zero,zero,xmm12[12,u,u,u,u],zero +; AVX-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm1, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm2 = [16777216,197120] -; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX-NEXT: vorps %ymm3, %ymm2, %ymm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] -; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm5, %ymm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX-NEXT: vandps %ymm6, %ymm4, %ymm4 ; AVX-NEXT: vorps %ymm1, %ymm4, %ymm4 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] -; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[4,5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm12[13,u,u,u,u],zero,zero,xmm12[14,u,u,u,u],zero,zero,xmm12[15] +; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] +; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = 
[0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm4, %ymm5 +; AVX-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX-NEXT: vorps %ymm6, %ymm4, %ymm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] ; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2 @@ -3583,14 +3579,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps %ymm1, 128(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 32(%rax) -; AVX-NEXT: vmovaps %ymm5, 96(%rax) +; AVX-NEXT: vmovaps %ymm4, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 160(%rax) -; AVX-NEXT: vmovaps %ymm6, 64(%rax) +; AVX-NEXT: vmovaps %ymm5, 64(%rax) ; AVX-NEXT: vmovdqa %xmm0, 192(%rax) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 208(%rax) -; AVX-NEXT: addq $216, %rsp +; AVX-NEXT: addq $168, %rsp ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3598,38 +3594,36 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-NEXT: 
vmovdqa (%rsi), %ymm6 -; AVX2-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] -; AVX2-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] -; AVX2-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero -; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] -; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u,u,u,255,0,255,0,u,u,u,255,0,255,0,u,u,u] +; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,ymm3[27,28,29,30],zero,ymm3[28],zero,ymm3[26,27,30,31],zero,ymm3[29] +; AVX2-NEXT: vmovdqa (%r9), %ymm6 +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero +; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] +; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vmovdqa (%rax), %ymm7 +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 @@ -3652,114 +3646,114 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX2-NEXT: vmovdqa (%rax), %xmm15 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5,5,6] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm7, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm5, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] -; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] +; AVX2-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
[u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 +; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm9, %ymm5 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm9 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[1,1,0,0,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm5 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm10 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero -; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] -; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero -; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero +; AVX2-NEXT: vpor %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero +; AVX2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = 
ymm5[2,2,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero -; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero +; AVX2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] -; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27] +; AVX2-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero -; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero +; AVX2-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-NEXT: vpshufb 
{{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] -; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm8, %ymm5 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero -; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18],zero +; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero -; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,ymm3[1,2,3,0,1,14],zero,ymm3[0,1,0,1,14,15],zero,ymm3[15,16,17,18,19,16],zero,ymm3[30,31,16,17,16,17],zero,ymm3[31,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) @@ -3777,262 +3771,262 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-LABEL: store_i8_stride7_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] +; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rax), %xmm8 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FP-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u] +; AVX2-FP-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm9[4,u,u,u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6] +; AVX2-FP-NEXT: vpermq {{.*#+}} 
ymm13 = ymm13[0,0,1,0] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX2-FP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm14, %ymm13, %ymm13 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm10, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm8 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] +; 
AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FP-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm10, %ymm11, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero +; AVX2-FP-NEXT: vmovdqa (%rax), %ymm10 ; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero ; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero ; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero ; AVX2-FP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero -; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18],zero +; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero +; 
AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm7[1,2,3,0,1,14],zero,ymm7[0,1,0,1,14,15],zero,ymm7[15,16,17,18,19,16],zero,ymm7[30,31,16,17,16,17],zero,ymm7[31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FP-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FP-NEXT: vmovdqa %ymm9, 192(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride7_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FCP-NEXT: 
vmovdqa (%rcx), %xmm8 +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rax), %xmm9 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,5,6] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm7[u,u,u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero,xmm7[u,u] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; AVX2-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm14, %ymm6, %ymm14 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm4 +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm10, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,0,0,4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm9 +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero +; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm9 ; AVX2-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20],zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero -; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero +; AVX2-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero ; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = 
ymm11[2,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] @@ -4043,16 +4037,16 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero ; AVX2-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] @@ -4061,212 +4055,211 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero -; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18],zero ; AVX2-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero +; AVX2-FCP-NEXT: 
vpor %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm12, 160(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm11, 192(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i8_stride7_vf32: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqa (%r9), %ymm5 -; AVX512-NEXT: vmovdqa64 (%r10), %ymm18 -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6,u,u] -; AVX512-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm11[0,1,0,1],zmm10[0,1,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) -; AVX512-NEXT: vmovdqa (%r9), %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] -; AVX512-NEXT: vmovdqa (%r8), %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; AVX512-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm7[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r10), %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512-NEXT: vpermq {{.*#+}} zmm17 = zmm0[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm16)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25] -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm15[2,3,2,3] -; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm16 -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[2,3,2,3],zmm1[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm15[2,3,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[2,3,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-NEXT: vpermi2d %zmm2, %zmm7, %zmm15 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6,u,u,u],zero +; AVX512-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u,u,9] +; AVX512-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-NEXT: vmovdqa64 (%r9), %ymm17 +; AVX512-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,0,1],zmm5[0,1,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm9[0,1,0,1],zmm8[0,1,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm5)) +; AVX512-NEXT: vmovdqa (%r9), %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512-NEXT: vmovdqa (%r8), %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; 
AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm10[0,1,0,1],zmm5[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rax), %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm15, %zmm5 +; AVX512-NEXT: vpermq {{.*#+}} zmm16 = zmm5[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm13)) +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[2,3,2,3],zmm13[2,3,2,3] +; AVX512-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512-NEXT: vporq %zmm13, %zmm14, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] +; AVX512-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[2,3,2,3],zmm14[2,3,2,3] +; AVX512-NEXT: vporq %zmm0, %zmm14, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm13)) +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512-NEXT: vmovdqa (%r8), %ymm13 +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[2,3,2,3],zmm14[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,ymm13[20,21,20,21],zero,ymm13[19],zero,ymm13[19,20,21,22],zero +; AVX512-NEXT: vmovdqa (%rax), %ymm14 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] +; AVX512-NEXT: vporq %zmm15, %zmm1, %zmm17 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512-NEXT: vpermi2d %zmm5, %zmm1, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm17)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm1)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14,u,u],zero,zero,zero,zero,ymm3[15,u,u],zero,zero,zero,zero,ymm3[16,u,u],zero,zero,zero,zero,ymm3[17,u,u],zero,zero,zero,zero,ymm3[18] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[u,u,0,1,14,15],zero,ymm6[u,u,13,2,3,16],zero,ymm6[u,u,28,29,16,17],zero,ymm6[u,u,19,28,29,18],zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u],zero,ymm13[14,u,u,u,u,u],zero,ymm13[15,u,u,u,u,u],zero,ymm13[16,u,u,u,u,u],zero,ymm13[17,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] -; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u],zero,ymm7[14,u,u,u,u,u],zero,ymm7[15,u,u,u,u,u],zero,ymm7[16,u,u,u,u,u],zero,ymm7[17,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,14],zero,ymm11[u,u,u,u,u,15],zero,ymm11[u,u,u,u,u,16],zero,ymm11[u,u,u,u,u,17],zero,ymm11[u,u,u,u,u] +; AVX512-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & 
(zmm1 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm13[30],zero,ymm13[28,u,u,u],zero,ymm13[31],zero,ymm13[29,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm13[u],zero,zero,zero,zero,ymm13[14],zero,ymm13[u],zero,zero,zero,zero,ymm13[15],zero,ymm13[u],zero,zero,zero,zero,ymm13[16],zero,ymm13[u],zero,zero,zero,zero,ymm13[17],zero,ymm13[u],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm14[13,u,u,u,u],zero,zero,ymm14[14,u,u,u,u],zero,zero,ymm14[15,u,u,u,u],zero,zero,ymm14[16,u,u,u,u],zero,zero,ymm14[17,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm7[30],zero,ymm7[28,u,u,u],zero,ymm7[31],zero,ymm7[29,u] ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; 
AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm6[28],zero,ymm6[30,31,30,31],zero,ymm6[29],zero,ymm6[31,28,29] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] ; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero -; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm1)) -; AVX512-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride7_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[2,3,2,3],zmm9[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero ; AVX512-FCP-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[2,3,2,3],zmm9[2,3,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] ; AVX512-FCP-NEXT: vporq %zmm8, %zmm9, %zmm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] 
; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm2[18],zero,ymm2[20,21,20,21],zero,ymm2[19],zero,ymm2[19,20,21,22],zero +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm10[2,3,2,3] ; AVX512-FCP-NEXT: vporq %zmm7, %zmm9, %zmm9 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,21,20,0,21,0,20,0,4,5,0,7,0,5,0,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,21,20,u,21,u,20,u,4,5,u,7,u,5,u,7] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm7 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) @@ -4274,38 +4267,38 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] -; AVX512-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,0,1],zmm8[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-FCP-NEXT: vpor %xmm15, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,0,1],zmm11[0,1,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm11 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,17,0,17,0,16,16,0,0,0,0,0,2,3,0,1] -; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm13, %zmm17 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,17,u,17,u,16,16,u,0,u,0,u,2,3,u,1] +; AVX512-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX512-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,0,1],zmm1[0,1,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4317,22 +4310,22 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[u],zero,zero,zero,zero,ymm2[14],zero,ymm2[u],zero,zero,zero,zero,ymm2[15],zero,ymm2[u],zero,zero,zero,zero,ymm2[16],zero,ymm2[u],zero,zero,zero,zero,ymm2[17],zero,ymm2[u],zero,zero ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm12[u],zero,zero,zero,zero,ymm12[14],zero,ymm12[u],zero,zero,zero,zero,ymm12[15],zero,ymm12[u],zero,zero,zero,zero,ymm12[16],zero,ymm12[u],zero,zero,zero,zero,ymm12[17],zero,ymm12[u],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[0,1,0,1] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] ; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero @@ -4344,8 +4337,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27,u,u,u],zero,ymm12[30],zero,ymm12[28,u,u,u],zero,ymm12[31],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] @@ -4362,176 +4356,175 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-LABEL: store_i8_stride7_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %ymm18 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6,u,u] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm11[0,1,0,1],zmm10[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; 
AVX512DQ-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm17 = zmm0[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm16)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm16 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm7, %zmm15 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6,u,u,u],zero +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u,u,9] +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-NEXT: vmovdqa64 (%r9), %ymm17 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,0,1],zmm5[0,1,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm9[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512DQ-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm10[0,1,0,1],zmm5[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm15, %zmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm16 = zmm5[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm13)) 
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-NEXT: vporq %zmm13, %zmm14, %zmm13 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-NEXT: vporq %zmm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm13)) +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,ymm13[20,21,20,21],zero,ymm13[19],zero,ymm13[19,20,21,22],zero +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm14 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] +; AVX512DQ-NEXT: vporq %zmm15, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm1, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm17)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) -; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm1)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14,u,u],zero,zero,zero,zero,ymm3[15,u,u],zero,zero,zero,zero,ymm3[16,u,u],zero,zero,zero,zero,ymm3[17,u,u],zero,zero,zero,zero,ymm3[18] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[u,u,0,1,14,15],zero,ymm6[u,u,13,2,3,16],zero,ymm6[u,u,28,29,16,17],zero,ymm6[u,u,19,28,29,18],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u],zero,ymm13[14,u,u,u,u,u],zero,ymm13[15,u,u,u,u,u],zero,ymm13[16,u,u,u,u,u],zero,ymm13[17,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u],zero,ymm7[14,u,u,u,u,u],zero,ymm7[15,u,u,u,u,u],zero,ymm7[16,u,u,u,u,u],zero,ymm7[17,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,14],zero,ymm11[u,u,u,u,u,15],zero,ymm11[u,u,u,u,u,16],zero,ymm11[u,u,u,u,u,17],zero,ymm11[u,u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; 
AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm13[30],zero,ymm13[28,u,u,u],zero,ymm13[31],zero,ymm13[29,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm13[u],zero,zero,zero,zero,ymm13[14],zero,ymm13[u],zero,zero,zero,zero,ymm13[15],zero,ymm13[u],zero,zero,zero,zero,ymm13[16],zero,ymm13[u],zero,zero,zero,zero,ymm13[17],zero,ymm13[u],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm14[13,u,u,u,u],zero,zero,ymm14[14,u,u,u,u],zero,zero,ymm14[15,u,u,u,u],zero,zero,ymm14[16,u,u,u,u],zero,zero,ymm14[17,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm7[30],zero,ymm7[28,u,u,u],zero,ymm7[31],zero,ymm7[29,u] ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm6[28],zero,ymm6[30,31,30,31],zero,ymm6[29],zero,ymm6[31,28,29] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] ; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero -; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[2,3,2,3],zmm9[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[2,3,2,3],zmm9[2,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] ; AVX512DQ-FCP-NEXT: vporq %zmm8, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm2[18],zero,ymm2[20,21,20,21],zero,ymm2[19],zero,ymm2[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm10[2,3,2,3] ; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm9, %zmm9 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,21,20,0,21,0,20,0,4,5,0,7,0,5,0,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,21,20,u,21,u,20,u,4,5,u,7,u,5,u,7] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm7 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) @@ -4539,38 +4532,38 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm14[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,0,1],zmm11[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm11 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,17,0,17,0,16,16,0,0,0,0,0,2,3,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm13, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,17,u,17,u,16,16,u,0,u,0,u,2,3,u,1] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,0,1],zmm1[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4582,22 +4575,22 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[u],zero,zero,zero,zero,ymm2[14],zero,ymm2[u],zero,zero,zero,zero,ymm2[15],zero,ymm2[u],zero,zero,zero,zero,ymm2[16],zero,ymm2[u],zero,zero,zero,zero,ymm2[17],zero,ymm2[u],zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm12[u],zero,zero,zero,zero,ymm12[14],zero,ymm12[u],zero,zero,zero,zero,ymm12[15],zero,ymm12[u],zero,zero,zero,zero,ymm12[16],zero,ymm12[u],zero,zero,zero,zero,ymm12[17],zero,ymm12[u],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm10[0,1,0,1] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] ; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero @@ -4609,8 +4602,9 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27,u,u,u],zero,ymm12[30],zero,ymm12[28,u,u,u],zero,ymm12[31],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] @@ -4628,25 +4622,25 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm13 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[0,1,14],zero,ymm12[12,13,0,1,14,15],zero,ymm12[3,12,13,2,3,16],zero,ymm12[30,31,28,29,16,17],zero,ymm12[31,18,19,28,29,18],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero,ymm13[18] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512BW-NEXT: vpor %ymm2, %ymm5, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -4659,13 +4653,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] ; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-NEXT: kmovq %rcx, %k1 @@ -4673,27 +4667,27 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX512BW-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm16[18,19,20,21],zero,zmm16[19],zero,zmm16[25,26,27,22],zero,zmm16[20],zero,zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm14[2,3,2,3],zmm15[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512BW-NEXT: vporq %zmm14, %zmm16, %zmm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22],zero,zmm14[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero,zmm14[57] +; AVX512BW-NEXT: vporq %zmm16, %zmm14, %zmm14 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] ; AVX512BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-NEXT: kmovq %rcx, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512BW-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm16 = 
zmm16[2,3,2,3,6,7,6,7] @@ -4701,31 +4695,31 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovq %rcx, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: movl $338170920, %edx # imm = 0x14281428 +; AVX512BW-NEXT: kmovd %edx, %k2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k2} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX512BW-NEXT: kmovq %rcx, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] -; AVX512BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[2,3,2,3] ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm6 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] -; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] +; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX512BW-NEXT: vpunpcklbw 
{{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] @@ -4733,12 +4727,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[4],zero,xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero ; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 @@ -4782,55 +4776,55 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm7 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] +; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpor %ymm10, %ymm11, %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm15[19],zero,zmm15[21,20,21,22],zero,zmm15[20],zero,zmm15[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm15[55],zero,zmm15[53,54,55,58],zero,zmm15[56],zero,zmm15[60,61,58,59] 
; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20],zero,zero,zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm16[57],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero +; AVX512BW-FCP-NEXT: vporq %zmm15, %zmm16, %zmm15 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero @@ -4843,27 +4837,27 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] -; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,0,1],zmm15[0,1,0,1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm9[0,1,0,1] ; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 @@ -4906,25 +4900,25 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm13 ; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm4 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[0,1,14],zero,ymm12[12,13,0,1,14,15],zero,ymm12[3,12,13,2,3,16],zero,ymm12[30,31,28,29,16,17],zero,ymm12[31,18,19,28,29,18],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero,ymm13[18] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm5, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -4937,13 +4931,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 @@ -4951,27 +4945,27 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm16[18,19,20,21],zero,zmm16[19],zero,zmm16[25,26,27,22],zero,zmm16[20],zero,zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm14[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, 
%zmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512DQ-BW-NEXT: vporq %zmm14, %zmm16, %zmm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22],zero,zmm14[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero,zmm14[57] +; AVX512DQ-BW-NEXT: vporq %zmm16, %zmm14, %zmm14 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] @@ -4979,31 +4973,31 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: movl $338170920, %edx # imm = 0x14281428 +; AVX512DQ-BW-NEXT: kmovd %edx, %k2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k2} = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[2,3,2,3] ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] @@ -5011,12 +5005,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[4],zero,xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512DQ-BW-NEXT: 
vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 @@ -5060,55 +5054,55 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] +; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %ymm10, %ymm11, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm15[19],zero,zmm15[21,20,21,22],zero,zmm15[20],zero,zmm15[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm15[55],zero,zmm15[53,54,55,58],zero,zmm15[56],zero,zmm15[60,61,58,59] ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20],zero,zero,zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm16[57],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = 
zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm15, %zmm16, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero @@ -5121,27 +5115,27 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; 
AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,0,1],zmm15[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm9[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 @@ -6607,35 +6601,35 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride7_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq 
$616, %rsp # imm = 0x268 +; AVX-NEXT: subq $584, %rsp # imm = 0x248 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa 16(%rax), %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX-NEXT: vmovdqa 16(%r8), %xmm11 ; AVX-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] @@ -6655,8 +6649,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1 ; AVX-NEXT: vmovdqa %xmm3, %xmm8 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u] -; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX-NEXT: vmovdqa %xmm4, %xmm10 +; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX-NEXT: vmovdqa %xmm4, %xmm11 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u] ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 @@ -6664,22 +6658,21 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] -; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX-NEXT: vmovdqa %xmm3, %xmm12 +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm3 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128] ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] -; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm3 +; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm14, %xmm12, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] ; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 @@ -6693,18 +6686,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm0 -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm0 +; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; 
AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa 32(%rax), %xmm8 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm1 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm3, %xmm10 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm3, %xmm11 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] @@ -6712,22 +6705,23 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 ; AVX-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm1 ; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm6 -; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm7 +; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm7 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] ; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 @@ -6740,7 +6734,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] @@ -6769,39 +6763,37 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 48(%rax), %xmm12 +; AVX-NEXT: vmovdqa 48(%rax), %xmm8 ; AVX-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 48(%r9), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] -; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128] +; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13] -; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13] +; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm1 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 +; AVX-NEXT: vmovdqa 48(%rcx), %xmm13 +; 
AVX-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3 ; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3 ; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 @@ -6812,70 +6804,68 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa (%r9), %xmm6 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u] -; AVX-NEXT: vmovdqa (%r8), %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u] +; AVX-NEXT: vmovdqa (%r8), %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u] ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] -; AVX-NEXT: vmovdqa (%rax), %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] +; AVX-NEXT: vmovdqa (%rax), %xmm12 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[6,u,u,u,u],zero,zero,xmm12[7,u,u,u,u],zero,zero,xmm12[8,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 +; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm14 +; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14 ; AVX-NEXT: vmovdqa (%rcx), %xmm5 ; AVX-NEXT: vmovdqa (%rdx), %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 -; AVX-NEXT: vmovdqa (%rsi), %xmm14 +; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm4 +; AVX-NEXT: vmovdqa (%rsi), %xmm15 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] -; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[u],zero,xmm15[7,u,u,u,u,u],zero,xmm15[8,u,u,u,u,u],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] +; AVX-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 -; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4 -; AVX-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm1 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX-NEXT: vandps %ymm0, %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX-NEXT: vandnps %ymm15, %ymm0, %ymm4 +; AVX-NEXT: vandnps %ymm14, %ymm0, %ymm4 ; AVX-NEXT: vandps %ymm0, %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u],zero,zero,xmm12[11,u,u,u,u],zero,zero,xmm12[12,u,u,u,u],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -6889,239 +6879,238 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm11 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 -; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] -; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u],zero,zero,xmm2[2,u,u,u,u],zero,zero,xmm2[3,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5],zero,xmm3[u,u,u,u,6,7],zero,xmm3[u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload 
+; AVX-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u],zero,xmm13[7,u,u,u,u,u],zero,xmm13[8,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm4, %ymm4 +; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u,u,u,u],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u,u,u,u,9] +; AVX-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] +; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero -; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero -; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero -; AVX-NEXT: vpor %xmm4, %xmm11, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[9,u,u],zero,zero,zero,zero,xmm5[10,u,u],zero,zero,zero +; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] +; AVX-NEXT: vorps %ymm1, %ymm3, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero,zero,xmm0[10,u],zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10,11],zero,xmm1[13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,1,2,3,4],zero,xmm2[u,u,8,9,10,11],zero,xmm2[u,u,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm8[9],zero,zero,zero,zero,zero,zero,xmm8[10],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,xmm5[7,u,u],zero,zero,zero,zero,xmm5[8,u,u],zero +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm10 = [218890240,986624] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3 -; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero -; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX-NEXT: vpor %xmm4, %xmm2, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,1,2,3,4,5],zero,xmm1[u,8,9,10,11,12],zero,xmm1[u,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u],zero,zero,zero,zero,zero,xmm0[7,u],zero,zero,zero,zero,zero,xmm0[8,u],zero +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6],zero,xmm1[8,9,10,11,12,13],zero,xmm1[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[6],zero,zero,zero,zero,zero,zero,xmm8[7],zero,zero,zero,zero,zero,zero,xmm8[8],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0],zero,xmm2[u,u,4,5,6,7],zero,xmm2[u,u,11,12,13,14],zero +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] -; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] -; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm5[11,u,u],zero,zero,zero,zero,xmm5[12,u,u],zero,zero,zero,zero,xmm5[13] +; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[u,4,5,6,7,8],zero,xmm1[u,11,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[u,4,5,6,7,0],zero,xmm2[u,11,12,13,14,1],zero,xmm2[u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm0[11,u],zero,zero,zero,zero,zero,xmm0[12,u],zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[13,u],zero,zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,zero,xmm0[15,u] +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8,9],zero,xmm1[11,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6,7],zero,xmm2[9,10,11,12,13,14],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm8[11],zero,zero,zero,zero,zero,zero,xmm8[12],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[13],zero,zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,zero,xmm8[15] +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm15 = [16777216,197120] +; AVX-NEXT: vpor %xmm4, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10],zero,xmm2[u,u,u,u,13,12],zero,xmm2[u,u,u,u,15,14],zero +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm15[13,u,u,u,u],zero,zero,xmm15[14,u,u,u,u],zero,zero,xmm15[15] +; AVX-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm9 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u] +; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u] +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm9 +; AVX-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm9 +; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX-NEXT: vorps %ymm5, %ymm2, %ymm9 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm8 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm3 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm7 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; 
AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13,u,u,u,u],zero,zero,xmm12[14,u,u,u,u],zero,zero,xmm12[15] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] -; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] -; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm10 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm10 -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX-NEXT: vorps %ymm5, %ymm10, %ymm5 -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u] -; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u] -; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm13 -; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm13 -; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] -; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 -; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10 -; AVX-NEXT: vorps %ymm5, %ymm10, %ymm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpshufb 
%xmm6, %xmm2, %xmm14 -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm8 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 -; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm12 -; AVX-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] -; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 -; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX-NEXT: vandnps %ymm8, %ymm13, %ymm8 -; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX-NEXT: vandnps %ymm8, %ymm0, %ymm3 -; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero -; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 128(%rax) -; AVX-NEXT: vmovaps %ymm1, 96(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 128(%rax) +; AVX-NEXT: vmovaps %ymm0, 96(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: 
vmovaps %ymm0, 64(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 32(%rax) -; AVX-NEXT: vmovaps %ymm5, (%rax) -; AVX-NEXT: vmovaps %ymm10, 224(%rax) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm1, 352(%rax) +; AVX-NEXT: vmovaps %ymm2, (%rax) +; AVX-NEXT: vmovaps %ymm9, 224(%rax) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 352(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 320(%rax) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7140,7 +7129,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps %xmm0, 384(%rax) ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, 400(%rax) -; AVX-NEXT: addq $616, %rsp # imm = 0x268 +; AVX-NEXT: addq $584, %rsp # imm = 0x248 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7216,10 +7205,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] ; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] @@ -7387,8 +7376,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] @@ -7430,8 +7418,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm7 
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] @@ -7584,63 +7571,63 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,ymm7[27,20,21,26],zero,ymm7[24],zero,ymm7[26,27,26,27],zero,ymm7[25] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] -; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27] +; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FP-NEXT: vmovdqu 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -7802,29 +7789,28 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,18,19,30],zero,ymm4[28],zero,ymm4[28,29,30,31],zero,ymm4[29],zero,ymm4[31] ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm2 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] -; AVX2-FP-NEXT: 
vpblendvb %ymm0, %ymm2, %ymm4, %ymm6 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm14 -; AVX2-FP-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero -; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,ymm13[27,28,29,30],zero,ymm13[28],zero,ymm13[26,27,30,31],zero,ymm13[29] +; AVX2-FP-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpor %ymm2, %ymm7, %ymm7 -; AVX2-FP-NEXT: vmovdqa (%rax), %ymm10 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] @@ -7832,138 +7818,133 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero ; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm2 -; AVX2-FP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] -; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] +; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FP-NEXT: # ymm9 = mem[0,1,0,1] +; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 -; AVX2-FP-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm12 +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm11 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 -; AVX2-FP-NEXT: vpor %ymm12, %ymm15, %ymm12 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FP-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm4 -; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm10 +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm3 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm9 +; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm10 ; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm11 -; AVX2-FP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm12 +; AVX2-FP-NEXT: vpor %ymm10, %ymm12, %ymm10 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 -; AVX2-FP-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm6, %ymm15 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm3, %ymm15 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm15, %ymm11 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm10 -; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 -; AVX2-FP-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm9, %ymm15, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm11 +; AVX2-FP-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm11 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; 
AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm7, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FP-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm6, %ymm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-FP-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm7 -; AVX2-FP-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm10 -; AVX2-FP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm6 +; AVX2-FP-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vpshufb %ymm12, %ymm5, %ymm1 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm1 +; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm4 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm11 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm4 -; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FP-NEXT: vmovdqa %ymm9, 128(%rax) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FP-NEXT: vmovdqa %ymm14, 352(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7992,63 +7973,63 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25] -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,ymm7[27,20,21,26],zero,ymm7[24],zero,ymm7[26,27,26,27],zero,ymm7[25] +; AVX2-FCP-NEXT: vmovdqu %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27] +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FCP-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -8088,7 +8069,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX2-FCP-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13 @@ -8191,17 +8172,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,6] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] -; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm6, %ymm2 +; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,6] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] @@ -8211,25 +8192,25 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31] +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[17,18,19,30],zero,ymm4[28],zero,ymm4[28,29,30,31],zero,ymm4[29],zero,ymm4[31] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm2 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 
= zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm6 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm14 -; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[27,28,29,30],zero,ymm1[28],zero,ymm1[26,27,30,31],zero,ymm1[29] ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero ; AVX2-FCP-NEXT: vpor %ymm2, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm10 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] @@ -8241,135 +8222,133 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] -; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] +; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm11 -; AVX2-FCP-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm12 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm11 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm15 -; AVX2-FCP-NEXT: vpor %ymm12, %ymm15, %ymm12 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm10 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm9 +; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm10 ; 
AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm11 -; AVX2-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm13 -; AVX2-FCP-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm15, %ymm13 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm12 ; AVX2-FCP-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FCP-NEXT: # ymm12 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm9, 
%ymm14, %ymm9 +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FCP-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FCP-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FCP-NEXT: # ymm11 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm11 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FCP-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-FCP-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm12 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm7 -; AVX2-FCP-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm6 +; AVX2-FCP-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm1 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: 
vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm6 +; AVX2-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FCP-NEXT: vmovdqa %ymm9, 128(%rax) +; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm14, 352(%rax) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax) @@ -8397,71 +8376,67 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512-NEXT: vpshufb %ymm5, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512-NEXT: vpshufb %ymm4, %ymm10, %ymm1 +; AVX512-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX512-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512-NEXT: vmovdqa 
{{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm4, %ymm11, %ymm0 +; AVX512-NEXT: vmovdqa (%rdx), %ymm10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %ymm14 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512-NEXT: vmovdqa (%r9), %ymm15 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm6, %ymm15, %ymm7 +; AVX512-NEXT: vmovdqa (%r8), %ymm13 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm6, %ymm13, %ymm0 +; AVX512-NEXT: vmovdqa (%r9), %ymm8 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u] +; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm7 ; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm7 -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512-NEXT: vporq %ymm0, %ymm7, %ymm31 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm9, %ymm13, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm13, 
%ymm19 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-NEXT: vpshufb %ymm9, %ymm10, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm10, %ymm23 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,128,30,128,28,u,u,u,128,31,128,29,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm11, %ymm7 ; AVX512-NEXT: vmovdqa64 %ymm11, %ymm16 ; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm7 -; AVX512-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm11, %ymm15, %ymm13 -; AVX512-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm7 +; AVX512-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,128,27,u,u,u,128,30,128,28,u,u,u,128,31,128] +; AVX512-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512-NEXT: vpor %ymm7, %ymm13, %ymm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm13 +; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm13 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm12 ; AVX512-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm13 -; AVX512-NEXT: vmovdqa64 %ymm14, %ymm25 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm12 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm13 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm12 @@ -8472,8 +8447,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufb 
{{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] -; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r8), %ymm9 ; AVX512-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512-NEXT: vmovdqa 32(%r9), %ymm10 @@ -8485,12 +8460,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa 32(%rax), %ymm14 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[2,3,2,3],zmm0[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512-NEXT: vpshufb %ymm4, %ymm13, %ymm0 ; AVX512-NEXT: vpshufb %ymm3, %ymm12, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8498,37 +8474,35 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm12[18,19,20,21],zero,ymm12[19],zero,ymm12[25,26,27,22],zero,ymm12[20],zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] ; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm9[18],zero,ymm9[20,21,20,21],zero,ymm9[19],zero,ymm9[19,20,21,22],zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm2, %ymm9, %ymm0 -; AVX512-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512-NEXT: vpshufb %ymm6, %ymm9, %ymm0 +; AVX512-NEXT: vpshufb %ymm5, %ymm10, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512-NEXT: vmovdqa (%rcx), %xmm0 @@ -8536,288 +8510,286 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm7 -; AVX512-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %xmm8 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28 ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512-NEXT: vporq %xmm4, %xmm5, %xmm26 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512-NEXT: vporq %xmm3, %xmm2, %xmm24 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] ; AVX512-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512-NEXT: vporq %xmm1, %xmm0, %xmm23 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm26 -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm25 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm19 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512-NEXT: vpshufb %ymm12, %ymm15, %ymm1 +; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm27 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm25 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm23 +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqa (%rax), %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa (%rax), %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa (%rax), %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa (%rax), %xmm0 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512-NEXT: vmovdqa %xmm9, %xmm12 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: 
vshufi64x2 {{.*#+}} zmm31 = zmm31[2,3,2,3],zmm3[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512-NEXT: vmovdqa %xmm10, %xmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[2,3,2,3],zmm4[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm14 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm12 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[2,3,2,3],zmm14[0,1,0,1] +; AVX512-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm14[0,1,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm17 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[2,3,2,3],zmm14[0,1,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512-NEXT: vpshufb %ymm4, %ymm14, %ymm4 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm14 -; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm31 ^ (zmm8 & (zmm16 ^ zmm31)) -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512-NEXT: # ymm31 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm3 ^ (zmm8 & (zmm31 ^ zmm3)) -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: 
vshufi64x2 {{.*#+}} zmm10 = zmm4[0,1,0,1],zmm10[0,1,0,1] -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm14 +; AVX512-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm30 # 64-byte Folded Reload +; AVX512-NEXT: # zmm30 = zmm7[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = zmm6[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm28 # 64-byte Folded Reload +; AVX512-NEXT: # zmm28 = zmm8[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm26[0,1,0,1],zmm8[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,0,1],zmm12[0,1,0,1] -; 
AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm24[0,1,0,1],zmm11[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm4)) +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm7 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm13)) +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (zmm7 & (zmm2 ^ zmm18)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = zmm10[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm10)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm2)) +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[2,3,2,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm13[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm23 & (zmm11 ^ zmm13)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm23 & (zmm12 ^ zmm10)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm25[2,3,2,3],zmm26[2,3,2,3] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm21[2,3,2,3],zmm22[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm13 ^ (zmm23 & (zmm15 ^ zmm13)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) -; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm6, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,0,1,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm16)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm11)) +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm2 ^ (zmm7 & (zmm10 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; 
AVX512-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm30)) +; AVX512-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm28)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm4 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm31)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm29)) -; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm20 ^ (mem & (zmm9 ^ zmm20)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm12)) -; AVX512-NEXT: vporq %zmm27, %zmm24, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm31)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm8 ^ (zmm9 & (zmm11 ^ zmm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19)) +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm27[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm23[2,3,2,3],zmm25[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm9 & (zmm6 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm11)) +; AVX512-NEXT: vporq %zmm21, %zmm20, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512-NEXT: addq $1320, %rsp # imm = 0x528 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-FCP-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,14],zero,ymm10[12,13,0,1,14,15],zero,ymm10[3,12,13,2,3,16],zero,ymm10[30,31,28,29,16,17],zero,ymm10[31,18,19,28,29,18],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23,u,u,u],zero,ymm11[26],zero,ymm11[24,u,u,u],zero,ymm11[27],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[23,u,u,u],zero,ymm14[26],zero,ymm14[24,u,u,u],zero,ymm14[27],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,128,30,128,28,u,u,u,128,31,128,29,u] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 
32(%r8), %ymm3 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm12 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,128,27,u,u,u,128,30,128,28,u,u,u,128,31,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero @@ -8825,370 +8797,371 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[2,3,2,3],zmm13[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 
%ymm20, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,u,6,u,4,u,6,7,u,17,u,17,u,16,16,u] ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm8 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[1,1,0,0,4,5,6,7] ; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm6, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm7 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512-FCP-NEXT: vpor %xmm6, %xmm9, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 ; AVX512-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,5,6] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] +; AVX512-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm30 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm28 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero,zero,ymm11[18] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero,zero,zero,ymm14[18] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[0,1,14],zero,ymm11[12,13,0,1,14,15],zero,ymm11[3,12,13,2,3,16],zero,ymm11[30,31,28,29,16,17],zero,ymm11[31,18,19,28,29,18],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm4 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm3[18],zero,ymm3[20,21,20,21],zero,ymm3[19],zero,ymm3[19,20,21,22],zero +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; 
AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm10 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm2 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vporq %xmm3, %xmm4, %xmm26 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,1,u,1,u,0,0,u,16,u,16,u,18,19,u,17] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm15 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm4 +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero ; AVX512-FCP-NEXT: vporq %ymm3, %ymm4, %ymm25 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vporq %ymm3, %ymm0, %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, 
%ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] ; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm23 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero +; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm3[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[2,3,2,3],zmm3[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm11 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm3[0,1,0,1],zmm0[0,1,0,1] -; 
AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = 
zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm28, %ymm13 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm26[0,1,0,1] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm24[2,3,2,3],zmm25[2,3,2,3] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm22[2,3,2,3],zmm23[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm12 & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vporq %zmm20, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm5)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = 
xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[0,1,0,1],mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm24[2,3,2,3],zmm25[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm20[2,3,2,3],zmm23[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm10 & (zmm9 ^ zmm6)) +; AVX512-FCP-NEXT: vporq %zmm19, %zmm18, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm6)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9)) +; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,0,1],mem[0,1,0,1] ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm17 ^ (zmm12 & (zmm31 ^ zmm17)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm21 ^ (zmm7 & (zmm16 ^ zmm21)) -; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm12 = mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm7 & (zmm12 ^ zmm11)) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4)) -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = zmm8[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm10 & (zmm4 ^ zmm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm16 ^ (zmm10 & (zmm22 ^ zmm16)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm3 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm6)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm21 ^ (zmm3 & (zmm12 ^ zmm21)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm2 ^ (mem & (zmm31 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm12)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm30 ^ (mem & (zmm28 ^ zmm30)) +; AVX512-FCP-NEXT: vpermq $238, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm2 = mem[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm4 ^ (mem & (zmm30 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm16)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = 
zmm1 ^ (mem & (zmm9 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (mem & (zmm2 ^ zmm18)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm3 & (zmm4 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm22)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm17 ^ (mem & (zmm5 ^ zmm17)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX512-FCP-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm10, %ymm1 +; AVX512DQ-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; 
AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm11, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm15 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm15, %ymm7 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm13, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm7 ; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-NEXT: vporq %ymm0, %ymm7, %ymm31 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm13, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm15, %ymm7 +; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm10, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm23 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,128,30,128,28,u,u,u,128,31,128,29,u] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm11, %ymm7 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm16 ; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm15, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm7 +; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,128,27,u,u,u,128,30,128,28,u,u,u,128,31,128] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18 ; AVX512DQ-NEXT: vpor %ymm7, %ymm13, %ymm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm13 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm13 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm12 ; AVX512DQ-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm25 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm13 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512DQ-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm12 @@ -9199,8 +9172,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] -; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm9 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm10 @@ -9212,12 +9185,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm12, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9225,37 +9199,35 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm12[18,19,20,21],zero,ymm12[19],zero,ymm12[25,26,27,22],zero,ymm12[20],zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm9[18],zero,ymm9[20,21,20,21],zero,ymm9[19],zero,ymm9[19,20,21,22],zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm10, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 @@ -9263,288 +9235,286 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX512DQ-NEXT: 
vmovdqa64 %xmm8, %xmm28 ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512DQ-NEXT: vporq %xmm4, %xmm5, %xmm26 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512DQ-NEXT: vporq %xmm3, %xmm2, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512DQ-NEXT: vporq %xmm1, %xmm0, %xmm23 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm25 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm19 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm15, %ymm1 +; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm27 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm25 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm23 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm31[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[2,3,2,3],zmm4[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm14 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm12 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[2,3,2,3],zmm14[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm14, %xmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm14[0,1,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm17 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[2,3,2,3],zmm14[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-NEXT: vpshufb %ymm4, 
%ymm14, %ymm4 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm29 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm31 ^ (zmm8 & (zmm16 ^ zmm31)) -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm31 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm3 ^ (zmm8 & (zmm31 ^ zmm3)) -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm4[0,1,0,1],zmm10[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm30 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm30 = zmm7[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = zmm6[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm28 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm28 = zmm8[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = 
zmm26[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,0,1],zmm12[0,1,0,1] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm24[0,1,0,1],zmm11[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, (%rsp), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm7 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm13)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (zmm7 & (zmm2 ^ zmm18)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = zmm10[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm2)) +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm2 = mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm13 = zmm13[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm23 & (zmm11 ^ zmm13)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm23 & (zmm12 ^ zmm10)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm25[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm21[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm13 ^ (zmm23 & (zmm15 ^ zmm13)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) -; AVX512DQ-NEXT: vinserti64x4 $1, (%rsp), %zmm6, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,0,1,0] -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm16)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm11)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm2 ^ (zmm7 & (zmm10 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm30)) +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm28)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm4 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm31)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm29)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm20 ^ (mem & (zmm9 ^ zmm20)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm12)) -; AVX512DQ-NEXT: vporq %zmm27, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm31)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm8 ^ (zmm9 & (zmm11 ^ zmm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19)) +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = zmm27[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm23[2,3,2,3],zmm25[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm9 & (zmm6 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm11)) +; AVX512DQ-NEXT: vporq %zmm21, %zmm20, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512DQ-NEXT: 
vmovdqa64 %zmm15, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-NEXT: addq $1320, %rsp # imm = 0x528 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FCP-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,14],zero,ymm10[12,13,0,1,14,15],zero,ymm10[3,12,13,2,3,16],zero,ymm10[30,31,28,29,16,17],zero,ymm10[31,18,19,28,29,18],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-FCP-NEXT: 
vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23,u,u,u],zero,ymm11[26],zero,ymm11[24,u,u,u],zero,ymm11[27],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[23,u,u,u],zero,ymm14[26],zero,ymm14[24,u,u,u],zero,ymm14[27],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,128,30,128,28,u,u,u,128,31,128,29,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,128,27,u,u,u,128,30,128,28,u,u,u,128,31,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero @@ -9552,300 +9522,305 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[2,3,2,3],zmm13[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,u,6,u,4,u,6,7,u,17,u,17,u,16,16,u] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm8 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[1,1,0,0,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm6, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, 
%xmm0, %xmm7 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,5,6] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm7, %zmm8, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm28 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero,zero,ymm11[18] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero,zero,zero,ymm14[18] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[0,1,14],zero,ymm11[12,13,0,1,14,15],zero,ymm11[3,12,13,2,3,16],zero,ymm11[30,31,28,29,16,17],zero,ymm11[31,18,19,28,29,18],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm4 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm3[18],zero,ymm3[20,21,20,21],zero,ymm3[19],zero,ymm3[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm2 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm4, %xmm26 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,1,u,1,u,0,0,u,16,u,16,u,18,19,u,17] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, 
%xmm4 +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero ; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm0, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; 
AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm3[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 
%xmm0, %xmm27 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[2,3,2,3],zmm3[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm3[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm28, %ymm13 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm6 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,0,1],zmm26[0,1,0,1] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm24[2,3,2,3],zmm25[2,3,2,3] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm22[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm12 & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vporq %zmm20, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm24[2,3,2,3],zmm25[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm20[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm10 & (zmm9 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vporq %zmm19, %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,0,1],mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm17 ^ (zmm12 & (zmm31 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm21 ^ (zmm7 & (zmm16 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm12 = mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm7 & (zmm12 ^ zmm11)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm10 & (zmm4 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm16 ^ (zmm10 & (zmm22 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded 
Reload ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm3 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm21 ^ (zmm3 & (zmm12 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm2 ^ (mem & (zmm31 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm30 ^ (mem & (zmm28 ^ zmm30)) +; AVX512DQ-FCP-NEXT: vpermq $238, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm2 = mem[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm4 ^ (mem & (zmm30 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm16)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (mem & (zmm9 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (mem & (zmm2 ^ zmm18)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm3 & (zmm4 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm22)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm17 ^ (mem & (zmm5 ^ zmm17)) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX512DQ-FCP-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index a9da7abaa945c..73e0ea741a887 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -24,8 +24,8 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r11), %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa (%r11), %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] @@ -67,10 +67,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vmovdqa (%r11), %xmm3 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX-NEXT: vmovdqa %xmm0, (%rax) @@ -87,10 +87,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX2-NEXT: vmovdqa %xmm0, (%rax) @@ -107,10 +107,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) @@ -127,10 +127,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -147,10 +147,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 
= xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-NEXT: vmovdqa %xmm0, (%rax) @@ -167,10 +167,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -187,10 +187,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) @@ -207,10 +207,10 @@ define void 
@store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -227,10 +227,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) @@ -247,10 +247,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -267,10 +267,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) @@ -287,10 +287,10 @@ define void @store_i8_stride8_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) @@ -324,26 +324,26 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r11), %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: movdqa (%r11), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] ; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,6,4] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,7,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm7, %xmm5 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] ; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] @@ -363,18 +363,18 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm7, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] @@ -403,10 +403,10 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX-NEXT: vmovdqa (%r11), %xmm3 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX-NEXT: 
vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm3[0] ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 @@ -424,20 +424,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -447,20 +447,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FP-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FP-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-FP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vzeroupper @@ -470,20 +470,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper @@ -493,20 +493,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-NEXT: vmovdqa (%r10), %xmm3 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -516,20 +516,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm0, 
(%rax) ; AVX512-FCP-NEXT: vzeroupper @@ -539,20 +539,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -562,20 +562,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -585,20 +585,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%r10), %xmm3 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -608,20 +608,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -631,20 +631,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -654,20 +654,20 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -856,7 +856,6 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -878,9 +877,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31] ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero @@ -896,7 +896,6 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; 
AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -918,9 +917,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,8],zero,zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[18,26],zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero +; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm2[4,12],zero,zero,zero,zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,ymm2[22,30],zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero +; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31] ; AVX2-FP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero @@ -936,7 +936,6 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -957,11 +956,12 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [201851904,218694913,235537922,252380931] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 @@ -1020,7 +1020,6 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1040,11 +1039,12 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpermi2d %ymm1, 
%ymm0, %ymm4 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [201851904,218694913,235537922,252380931] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -1103,7 +1103,6 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1123,11 +1122,12 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [201851904,218694913,235537922,252380931] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,8,10,0,2,8,10] ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm0 @@ -1153,23 +1153,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[36,44],zero,zero,zero,zero,zero,zero,zmm4[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[54,62],zero,zero,zero,zero,zero,zero,zmm4[55,63],zero,zero,zero,zero ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[36,44],zero,zero,zero,zero,zero,zero,zmm0[37,45],zero,zero,zero,zero,zmm0[54,62],zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpunpcklqdq 
{{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512BW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm3[0,8],zero,zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18,26],zero,zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[36,44],zero,zero,zero,zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[54,62],zero,zero,zero,zero,zero,zero,zmm3[55,63] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zmm2[54,62],zero,zero,zero,zero,zero,zero,zmm2[55,63],zero,zero -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | zmm0 | zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1190,7 +1190,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19] ; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12],zero,zero,zero,zero,zmm0[1,5,9,13],zero,zero,zero,zero,zmm0[18,22,26,30],zero,zero,zero,zero,zmm0[19,23,27,31],zero,zero,zero,zero,zmm0[32,36,40,44],zero,zero,zero,zero,zmm0[33,37,41,45],zero,zero,zero,zero,zmm0[50,54,58,62],zero,zero,zero,zero,zmm0[51,55,59,63],zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 @@ -1215,23 +1215,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX512DQ-BW-NEXT: vinserti128 $1, 
%xmm1, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[36,44],zero,zero,zero,zero,zero,zero,zmm4[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[54,62],zero,zero,zero,zero,zero,zero,zmm4[55,63],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[36,44],zero,zero,zero,zero,zero,zero,zmm0[37,45],zero,zero,zero,zero,zmm0[54,62],zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512DQ-BW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm3[0,8],zero,zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18,26],zero,zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[36,44],zero,zero,zero,zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[54,62],zero,zero,zero,zero,zero,zero,zmm3[55,63] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zmm2[54,62],zero,zero,zero,zero,zero,zero,zmm2[55,63],zero,zero -; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | zmm0 | zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1252,7 +1252,7 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq 
{{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,16,18,0,2,16,18,1,3,17,19,1,3,17,19] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12],zero,zero,zero,zero,zmm0[1,5,9,13],zero,zero,zero,zero,zmm0[18,22,26,30],zero,zero,zero,zero,zmm0[19,23,27,31],zero,zero,zero,zero,zmm0[32,36,40,44],zero,zero,zero,zero,zmm0[33,37,41,45],zero,zero,zero,zero,zmm0[50,54,58,62],zero,zero,zero,zero,zmm0[51,55,59,63],zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm2 @@ -1610,25 +1610,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1674,25 +1674,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = 
ymm4[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-FP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1738,25 +1738,25 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,2048,0,2305,0,2562,0,2819] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [2048,2305,2562,2819] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] 
; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,3076,0,3333,0,3590,0,3847] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [3076,3333,3590,3847] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1800,27 +1800,27 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-NEXT: vpblendw 
{{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1864,27 +1864,27 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1928,27 +1928,27 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1992,27 +1992,27 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; 
AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -2065,8 +2065,8 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm2[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] -; AVX512BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] @@ -2103,8 +2103,8 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm7, %zmm7 @@ -2161,8 +2161,8 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm2[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] -; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] @@ -2199,8 +2199,8 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm7, %zmm7 @@ -3163,7 +3163,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa 
{{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] @@ -3230,7 +3230,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -3313,7 +3313,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] @@ -3380,7 +3380,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -3635,16 +3635,17 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm14 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28 @@ -3653,18 +3654,18 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm9 = [1284,1798] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 16(%r10), %xmm10 ; AVX512-FCP-NEXT: vmovdqa 16(%rax), %xmm11 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm6 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512-FCP-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX512-FCP-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,10,11,10,11,10,11,0,1,2,3,12,13,12,13,12,13,10,11,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm3 @@ -3675,51 +3676,49 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm18 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm20 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 @@ -3735,21 +3734,22 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 @@ -3966,13 +3966,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 @@ -4123,7 +4123,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm7 ; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm8 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = 
[0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm11 @@ -4135,7 +4135,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm17 ; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm18 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -4183,7 +4183,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11 @@ -4195,7 +4195,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 @@ -4243,7 +4243,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm7 ; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm8 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm11 @@ -4255,7 +4255,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm17 ; 
AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm18 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 @@ -4303,7 +4303,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rcx), %xmm11 @@ -4315,7 +4315,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 @@ -6078,7 +6078,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 @@ -6120,7 +6120,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -6136,7 +6136,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -6187,7 +6187,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -6203,7 +6203,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -6256,7 +6256,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -6272,7 +6272,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -6399,7 +6399,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 @@ -6441,7 +6441,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -6457,7 +6457,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm10 = [1284,1798] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -6508,7 +6508,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = 
[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -6524,7 +6524,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [1284,1798] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -6577,7 +6577,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm10 = [2312,2826,3340,3854] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -6593,7 +6593,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm14 = [1284,1798] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -7226,15 +7226,15 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem)) ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] @@ -7910,15 +7910,15 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm7 & (zmm3 ^ mem)) ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] @@ -8065,7 +8065,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm22 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 ; AVX512BW-NEXT: kmovd %r11d, %k1 ; AVX512BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} @@ -8079,9 +8079,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%r8), %xmm10 ; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm26 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] ; AVX512BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] ; AVX512BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 ; AVX512BW-NEXT: kmovd %r11d, %k2 ; AVX512BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} @@ -8188,9 +8188,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 @@ -8202,7 +8202,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 ; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} @@ -8216,9 +8216,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] ; AVX512BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] ; AVX512BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 ; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} @@ -8367,7 +8367,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm22 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 ; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} @@ -8381,9 +8381,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm26 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] ; AVX512DQ-BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] ; AVX512DQ-BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 ; AVX512DQ-BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} @@ -8490,9 +8490,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 @@ -8504,7 +8504,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} @@ -8518,9 +8518,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-llrint-f16.ll b/llvm/test/CodeGen/X86/vector-llrint-f16.ll index 5e5c5849fc22e..f8b470c8f3b8e 100644 --- a/llvm/test/CodeGen/X86/vector-llrint-f16.ll +++ b/llvm/test/CodeGen/X86/vector-llrint-f16.ll @@ -30,15 +30,15 @@ define <2 x i64> @llrint_v2i64_v2f16(<2 x half> %x) { ; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX-NEXT: vcvttss2si %xmm0, %rax +; AVX-NEXT: vcvttss2si %xmm0, %rcx ; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; ; FP16-LABEL: llrint_v2i64_v2f16: @@ -107,51 +107,51 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX-NEXT: vcvttss2si %xmm2, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vcvtph2ps %xmm0, %xmm2 -; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vcvtps2ph 
$4, %xmm2, %xmm2 -; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX-NEXT: vcvttss2si %xmm2, %rax -; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm3 -; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vcvtph2ps %xmm3, %xmm2 ; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vcvtph2ps %xmm4, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax +; AVX-NEXT: vcvttss2si %xmm2, %rcx +; AVX-NEXT: vcvtph2ps %xmm3, %xmm2 ; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vcvtph2ps %xmm4, %xmm2 +; AVX-NEXT: vmovq %rcx, %xmm4 +; AVX-NEXT: vcvttss2si %xmm2, %rcx +; AVX-NEXT: vmovq %rax, %xmm5 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm4[0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; AVX-NEXT: vmovq %rcx, %xmm1 ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: vmovq %rax, %xmm3 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm3, %rax ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vmovq %rax, %xmm3 ; AVX-NEXT: vcvttss2si %xmm0, %rax ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] @@ -170,129 +170,129 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; AVX-LABEL: llrint_v16i64_v16f16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa %ymm0, %ymm2 -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %ymm0, %ymm1 +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm0 ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX-NEXT: vcvttss2si %xmm0, %rax ; AVX-NEXT: vmovq %rax, %xmm0 
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vcvtph2ps %xmm2, %xmm1 -; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm3 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: vcvtph2ps %xmm1, %xmm2 +; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm3 ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vcvttss2si %xmm3, %rax ; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX-NEXT: vcvttss2si %xmm1, %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vcvttss2si %xmm4, %rax +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax +; AVX-NEXT: vcvttss2si %xmm3, %rcx +; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vmovq %rcx, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax -; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vroundss $4, %xmm4, 
%xmm4, %xmm4 -; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: vmovq %rax, %xmm4 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpsrlq $48, %xmm3, %xmm2 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vcvtph2ps %xmm3, %xmm2 +; AVX-NEXT: vmovq %rax, %xmm5 +; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpsrlq $48, %xmm3, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcvttss2si %xmm1, %rax +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm5[0] +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] ; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vmovq %rax, %xmm4 ; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX-NEXT: vcvttss2si %xmm2, %rax +; AVX-NEXT: vcvtph2ps %xmm3, %xmm2 +; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX-NEXT: vcvttss2si %xmm2, %rcx ; AVX-NEXT: vmovq %rax, %xmm2 -; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: vmovq %rax, %xmm4 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] -; AVX-NEXT: vcvtph2ps %xmm3, %xmm4 -; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: vmovq %rax, %xmm4 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm5 ; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 ; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX-NEXT: vcvttss2si %xmm5, %rax -; AVX-NEXT: vmovq %rax, %xmm5 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX-NEXT: vmovq %rcx, %xmm5 +; AVX-NEXT: vmovq %rax, %xmm6 ; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm3[3,3,3,3] +; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vcvtps2ph $4, %xmm6, %xmm6 ; AVX-NEXT: vcvttss2si %xmm4, %rax -; AVX-NEXT: vmovq %rax, %xmm4 -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3] -; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX-NEXT: vcvttss2si %xmm5, %rax -; AVX-NEXT: vmovq %rax, %xmm5 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] +; AVX-NEXT: vcvtph2ps %xmm6, %xmm4 +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps 
%xmm5, %xmm5 ; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX-NEXT: vcvttss2si %xmm5, %rax +; AVX-NEXT: vcvttss2si %xmm4, %rcx +; AVX-NEXT: vcvtph2ps %xmm5, %xmm4 ; AVX-NEXT: vmovq %rax, %xmm5 ; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX-NEXT: vcvttss2si %xmm4, %rax ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rax +; AVX-NEXT: vmovq %rcx, %xmm4 +; AVX-NEXT: vcvttss2si %xmm3, %rcx ; AVX-NEXT: vmovq %rax, %xmm3 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX-NEXT: vmovq %rcx, %xmm5 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0] ; AVX-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX-NEXT: retq ; @@ -325,127 +325,127 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vcvttss2si %xmm3, %rcx ; AVX-NEXT: vmovq %rcx, %xmm3 +; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX-NEXT: vcvttss2si %xmm4, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; AVX-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX-NEXT: vcvttss2si %xmm3, %rcx +; AVX-NEXT: vcvttss2si %xmm3, %rdx ; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rcx -; AVX-NEXT: vmovq %rcx, %xmm4 +; AVX-NEXT: vmovq %rdx, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm5 +; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX-NEXT: vcvttss2si %xmm5, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX-NEXT: vcvtph2ps %xmm1, %xmm4 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; AVX-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; AVX-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX-NEXT: vcvttss2si %xmm4, %rcx +; AVX-NEXT: vcvttss2si %xmm4, %rdx ; AVX-NEXT: vmovq %rcx, %xmm4 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 -; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX-NEXT: vcvttss2si %xmm5, %rcx -; AVX-NEXT: vmovq %rcx, %xmm5 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm5 ; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; AVX-NEXT: vcvtps2ph $4, %xmm5, %xmm5 ; AVX-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX-NEXT: vcvttss2si %xmm5, %rcx -; AVX-NEXT: vmovq %rcx, %xmm5 +; AVX-NEXT: vmovq %rdx, %xmm5 ; AVX-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX-NEXT: 
vroundss $4, %xmm6, %xmm6, %xmm6 ; AVX-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX-NEXT: vmovq %rcx, %xmm5 ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX-NEXT: vcvttss2si %xmm6, %rcx ; AVX-NEXT: vmovq %rcx, %xmm6 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX-NEXT: vcvttss2si %xmm1, %rcx ; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} xmm6 = xmm8[1,0] +; AVX-NEXT: vshufpd {{.*#+}} xmm6 = xmm7[1,0] ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 ; AVX-NEXT: vcvtps2ph $4, %xmm6, %xmm6 ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX-NEXT: vcvttss2si %xmm6, %rcx ; AVX-NEXT: vmovq %rcx, %xmm6 +; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX-NEXT: vroundss $4, %xmm8, %xmm8, %xmm8 +; AVX-NEXT: vcvtps2ph $4, %xmm8, %xmm8 +; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX-NEXT: vcvttss2si %xmm8, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3,3,3] ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 ; AVX-NEXT: vcvtps2ph $4, %xmm6, %xmm6 ; AVX-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX-NEXT: vcvttss2si %xmm6, %rcx +; AVX-NEXT: vcvttss2si %xmm6, %rdx ; AVX-NEXT: vmovq %rcx, %xmm6 -; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX-NEXT: vroundss $4, %xmm7, %xmm7, %xmm7 -; AVX-NEXT: vcvtps2ph $4, %xmm7, %xmm7 -; AVX-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX-NEXT: vcvttss2si %xmm7, %rcx -; AVX-NEXT: vmovq %rcx, %xmm7 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] -; AVX-NEXT: vcvtph2ps %xmm8, %xmm7 -; AVX-NEXT: vroundss $4, %xmm7, %xmm7, %xmm7 -; AVX-NEXT: vcvtps2ph $4, %xmm7, %xmm7 -; AVX-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX-NEXT: vcvttss2si %xmm7, %rcx -; AVX-NEXT: vmovq %rcx, %xmm7 -; AVX-NEXT: vpsrld $16, %xmm8, %xmm9 -; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX-NEXT: vmovq %rdx, %xmm8 +; AVX-NEXT: vcvtph2ps %xmm7, %xmm9 ; AVX-NEXT: vroundss $4, %xmm9, %xmm9, %xmm9 ; AVX-NEXT: vcvtps2ph $4, %xmm9, %xmm9 ; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 ; AVX-NEXT: vcvttss2si %xmm9, %rcx -; AVX-NEXT: vmovq %rcx, %xmm9 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; AVX-NEXT: vpsrlq $48, %xmm8, %xmm9 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm8[0],xmm6[0] +; AVX-NEXT: vpsrld $16, %xmm7, %xmm8 +; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX-NEXT: vroundss $4, %xmm8, %xmm8, %xmm8 +; AVX-NEXT: vcvtps2ph $4, %xmm8, %xmm8 +; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX-NEXT: vcvttss2si %xmm8, %rdx +; AVX-NEXT: vmovq %rcx, %xmm8 +; AVX-NEXT: vpsrlq $48, %xmm7, %xmm9 ; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 ; AVX-NEXT: vroundss $4, %xmm9, %xmm9, %xmm9 ; AVX-NEXT: vcvtps2ph $4, %xmm9, %xmm9 ; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 ; AVX-NEXT: vcvttss2si %xmm9, %rcx -; AVX-NEXT: vmovq %rcx, %xmm9 -; AVX-NEXT: vmovshdup {{.*#+}} xmm8 = 
xmm8[1,1,3,3] +; AVX-NEXT: vmovq %rdx, %xmm9 +; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3] +; AVX-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX-NEXT: vroundss $4, %xmm7, %xmm7, %xmm7 +; AVX-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; AVX-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX-NEXT: vcvttss2si %xmm7, %rdx +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm9[0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 ; AVX-NEXT: vroundss $4, %xmm8, %xmm8, %xmm8 ; AVX-NEXT: vcvtps2ph $4, %xmm8, %xmm8 -; AVX-NEXT: vcvtph2ps %xmm8, %xmm8 -; AVX-NEXT: vcvttss2si %xmm8, %rcx -; AVX-NEXT: vmovq %rcx, %xmm8 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vmovq %rcx, %xmm9 +; AVX-NEXT: vmovq %rdx, %xmm10 +; AVX-NEXT: vcvtph2ps %xmm8, %xmm11 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] ; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 ; AVX-NEXT: vroundss $4, %xmm9, %xmm9, %xmm9 ; AVX-NEXT: vcvtps2ph $4, %xmm9, %xmm9 +; AVX-NEXT: vcvttss2si %xmm11, %rcx ; AVX-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX-NEXT: vmovq %rcx, %xmm10 ; AVX-NEXT: vcvttss2si %xmm9, %rcx ; AVX-NEXT: vmovq %rcx, %xmm9 -; AVX-NEXT: vshufpd {{.*#+}} xmm10 = xmm0[1,0] -; AVX-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX-NEXT: vroundss $4, %xmm10, %xmm10, %xmm10 -; AVX-NEXT: vcvtps2ph $4, %xmm10, %xmm10 -; AVX-NEXT: vcvtph2ps %xmm10, %xmm10 -; AVX-NEXT: vcvttss2si %xmm10, %rcx -; AVX-NEXT: vmovq %rcx, %xmm10 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm9[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm10 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vcvtph2ps %xmm10, %xmm10 ; AVX-NEXT: vroundss $4, %xmm10, %xmm10, %xmm10 @@ -474,85 +474,85 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; AVX-NEXT: vcvtph2ps %xmm12, %xmm12 ; AVX-NEXT: vcvttss2si %xmm12, %rcx ; AVX-NEXT: vmovq %rcx, %xmm12 +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm13 +; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX-NEXT: vroundss $4, %xmm13, %xmm13, %xmm13 +; AVX-NEXT: vcvtps2ph $4, %xmm13, %xmm13 +; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX-NEXT: vcvttss2si %xmm13, %rcx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm12 +; AVX-NEXT: vmovshdup {{.*#+}} xmm12 = xmm0[1,1,3,3] ; AVX-NEXT: vcvtph2ps %xmm12, %xmm12 ; AVX-NEXT: vroundss $4, %xmm12, %xmm12, %xmm12 ; AVX-NEXT: vcvtps2ph $4, %xmm12, %xmm12 ; AVX-NEXT: vcvtph2ps %xmm12, %xmm12 -; AVX-NEXT: vcvttss2si %xmm12, %rcx +; AVX-NEXT: vcvttss2si %xmm12, %rdx +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpsrldq {{.*#+}} xmm12 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX-NEXT: vroundss $4, %xmm12, %xmm12, %xmm12 +; AVX-NEXT: vcvtps2ph $4, %xmm12, %xmm12 +; AVX-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX-NEXT: vcvttss2si %xmm12, %rsi ; AVX-NEXT: vmovq %rcx, %xmm12 -; AVX-NEXT: vmovshdup {{.*#+}} xmm13 = xmm0[1,1,3,3] +; AVX-NEXT: vshufpd {{.*#+}} xmm13 = xmm0[1,0] ; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 ; AVX-NEXT: vroundss $4, %xmm13, %xmm13, %xmm13 ; AVX-NEXT: vcvtps2ph $4, %xmm13, %xmm13 ; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 ; AVX-NEXT: vcvttss2si %xmm13, %rcx -; AVX-NEXT: vmovq %rcx, %xmm13 +; AVX-NEXT: vmovq %rdx, %xmm13 +; AVX-NEXT: vpsrldq {{.*#+}} xmm14 
= xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX-NEXT: vroundss $4, %xmm14, %xmm14, %xmm14 +; AVX-NEXT: vcvtps2ph $4, %xmm14, %xmm14 +; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX-NEXT: vcvttss2si %xmm14, %rdx ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm13[0],xmm12[0] -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] ; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 ; AVX-NEXT: vroundss $4, %xmm13, %xmm13, %xmm13 ; AVX-NEXT: vcvtps2ph $4, %xmm13, %xmm13 +; AVX-NEXT: vmovq %rsi, %xmm14 +; AVX-NEXT: vmovq %rcx, %xmm15 ; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 ; AVX-NEXT: vcvttss2si %xmm13, %rcx -; AVX-NEXT: vmovq %rcx, %xmm13 -; AVX-NEXT: vshufpd {{.*#+}} xmm14 = xmm0[1,0] -; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX-NEXT: vroundss $4, %xmm14, %xmm14, %xmm14 -; AVX-NEXT: vcvtps2ph $4, %xmm14, %xmm14 -; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX-NEXT: vcvttss2si %xmm14, %rcx -; AVX-NEXT: vmovq %rcx, %xmm14 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm14[0],xmm13[0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vcvtph2ps %xmm0, %xmm13 +; AVX-NEXT: vroundss $4, %xmm13, %xmm13, %xmm13 +; AVX-NEXT: vcvtps2ph $4, %xmm13, %xmm13 +; AVX-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX-NEXT: vcvttss2si %xmm13, %rsi +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm15[0],xmm14[0] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm14 ; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 ; AVX-NEXT: vroundss $4, %xmm14, %xmm14, %xmm14 ; AVX-NEXT: vcvtps2ph $4, %xmm14, %xmm14 +; AVX-NEXT: vmovq %rdx, %xmm15 ; AVX-NEXT: vcvtph2ps %xmm14, %xmm14 -; AVX-NEXT: vcvttss2si %xmm14, %rcx -; AVX-NEXT: vmovq %rcx, %xmm14 -; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,3,3,3] -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vroundss $4, %xmm15, %xmm15, %xmm15 -; AVX-NEXT: vcvtps2ph $4, %xmm15, %xmm15 -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vcvttss2si %xmm15, %rcx -; AVX-NEXT: vmovq %rcx, %xmm15 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm15[0],xmm14[0] -; AVX-NEXT: vcvtph2ps %xmm0, %xmm15 -; AVX-NEXT: vroundss $4, %xmm15, %xmm15, %xmm15 -; AVX-NEXT: vcvtps2ph $4, %xmm15, %xmm15 -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vcvttss2si %xmm15, %rcx -; AVX-NEXT: vpsrld $16, %xmm0, %xmm15 -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vroundss $4, %xmm15, %xmm15, %xmm15 -; AVX-NEXT: vcvtps2ph $4, %xmm15, %xmm15 -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vcvttss2si %xmm15, %rdx -; AVX-NEXT: vmovq %rcx, %xmm15 -; AVX-NEXT: vmovq %rdx, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm15[0],xmm2[0] +; AVX-NEXT: vmovq %rcx, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm15[0] ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm15 ; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 ; AVX-NEXT: vroundss $4, %xmm15, %xmm15, %xmm15 ; AVX-NEXT: vcvtps2ph $4, %xmm15, %xmm15 -; AVX-NEXT: vcvtph2ps %xmm15, %xmm15 -; AVX-NEXT: vcvttss2si %xmm15, %rcx +; AVX-NEXT: vcvttss2si %xmm14, %rcx +; AVX-NEXT: vcvtph2ps %xmm15, %xmm14 +; AVX-NEXT: vcvttss2si %xmm14, %rdx +; AVX-NEXT: vmovq %rsi, %xmm14 +; AVX-NEXT: vmovq %rcx, %xmm15 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX-NEXT: vcvtph2ps 
%xmm0, %xmm0 -; AVX-NEXT: vcvttss2si %xmm0, %rdx -; AVX-NEXT: vmovq %rcx, %xmm0 -; AVX-NEXT: vmovq %rdx, %xmm15 +; AVX-NEXT: vcvttss2si %xmm0, %rcx +; AVX-NEXT: vmovq %rdx, %xmm0 +; AVX-NEXT: vmovq %rcx, %xmm15 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm15[0],xmm0[0] ; AVX-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX-NEXT: vmovdqa %xmm2, 64(%rdi) -; AVX-NEXT: vmovdqa %xmm14, 112(%rdi) +; AVX-NEXT: vmovdqa %xmm14, 64(%rdi) +; AVX-NEXT: vmovdqa %xmm2, 112(%rdi) ; AVX-NEXT: vmovdqa %xmm13, 96(%rdi) ; AVX-NEXT: vmovdqa %xmm12, 16(%rdi) ; AVX-NEXT: vmovdqa %xmm11, (%rdi) diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll index 7017eb60df41d..3687508e5ea8a 100644 --- a/llvm/test/CodeGen/X86/vector-llrint.ll +++ b/llvm/test/CodeGen/X86/vector-llrint.ll @@ -28,22 +28,21 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) { ; SSE-LABEL: llrint_v2i64_v2f32: ; SSE: # %bb.0: ; SSE-NEXT: cvtss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: cvtss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: llrint_v2i64_v2f32: ; AVX: # %bb.0: ; AVX-NEXT: vcvtss2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-NEXT: vcvtss2si %xmm0, %rax +; AVX-NEXT: vcvtss2si %xmm0, %rcx ; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; ; AVX512DQ-LABEL: llrint_v2i64_v2f32: @@ -124,38 +123,38 @@ declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>) define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) { ; SSE-LABEL: llrint_v8i64_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: cvtss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: cvtss2si %xmm3, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: cvtss2si %xmm3, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: cvtss2si %xmm2, %rax +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm2, %rcx ; SSE-NEXT: movq %rax, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rcx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: cvtss2si %xmm2, %rax ; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: cvtss2si %xmm3, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; SSE-NEXT: cvtss2si %xmm3, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: cvtss2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm5 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: cvtss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %xmm2 +; 
SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: cvtss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvtss2si %xmm1, %rax ; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: llrint_v8i64_v8f32: @@ -197,17 +196,17 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) { ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vcvtss2si %xmm2, %rcx ; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512-NEXT: vcvtss2si %xmm3, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq %rcx, %xmm3 ; AVX512-NEXT: vcvtss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512-NEXT: vcvtss2si %xmm1, %rax +; AVX512-NEXT: vcvtss2si %xmm1, %rcx ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq %rcx, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vcvtss2si %xmm2, %rax @@ -238,8 +237,8 @@ declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>) define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; SSE-LABEL: llrint_v16i64_v16f32: ; SSE: # %bb.0: -; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: cvtss2si %xmm0, %rcx +; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: movq %rcx, %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] @@ -255,11 +254,11 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; SSE-NEXT: movq %rcx, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: cvtss2si %xmm1, %rcx +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: cvtss2si %xmm5, %rdx ; SSE-NEXT: movq %rcx, %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1] -; SSE-NEXT: cvtss2si %xmm6, %rcx -; SSE-NEXT: movq %rcx, %xmm6 +; SSE-NEXT: movq %rdx, %xmm6 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] ; SSE-NEXT: movaps %xmm1, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3] @@ -289,17 +288,17 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] ; SSE-NEXT: cvtss2si %xmm8, %rcx -; SSE-NEXT: movq %rcx, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] -; SSE-NEXT: cvtss2si %xmm8, %rcx +; SSE-NEXT: cvtss2si %xmm8, %rdx ; SSE-NEXT: movq %rcx, %xmm8 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1] ; SSE-NEXT: cvtss2si %xmm3, %rcx -; SSE-NEXT: movq %rcx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} 
xmm3 = xmm3[0],xmm8[0] -; SSE-NEXT: movdqa %xmm3, 112(%rdi) +; SSE-NEXT: movq %rdx, %xmm3 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movq %rcx, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movdqa %xmm8, 112(%rdi) ; SSE-NEXT: movdqa %xmm7, 96(%rdi) ; SSE-NEXT: movdqa %xmm2, 80(%rdi) ; SSE-NEXT: movdqa %xmm6, 64(%rdi) @@ -314,16 +313,16 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; AVX1-NEXT: vmovaps %ymm0, %ymm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] ; AVX1-NEXT: vcvtss2si %xmm0, %rax +; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX1-NEXT: vcvtss2si %xmm0, %rcx ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX1-NEXT: vcvtss2si %xmm3, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vmovq %rcx, %xmm3 ; AVX1-NEXT: vcvtss2si %xmm2, %rax +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vcvtss2si %xmm3, %rcx ; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] -; AVX1-NEXT: vcvtss2si %xmm4, %rax -; AVX1-NEXT: vmovq %rax, %xmm4 +; AVX1-NEXT: vmovq %rcx, %xmm4 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -354,21 +353,21 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; AVX1-NEXT: vcvtss2si %xmm5, %rax ; AVX1-NEXT: vmovq %rax, %xmm5 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] -; AVX1-NEXT: vcvtss2si %xmm3, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 -; AVX1-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[3,3,3,3] ; AVX1-NEXT: vcvtss2si %xmm5, %rax -; AVX1-NEXT: vmovq %rax, %xmm5 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-NEXT: vcvtss2si %xmm3, %rcx +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vmovq %rcx, %xmm5 ; AVX1-NEXT: vcvtss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm5 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vcvtss2si %xmm1, %rax +; AVX1-NEXT: vcvtss2si %xmm1, %rcx ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0] +; AVX1-NEXT: vmovq %rcx, %xmm5 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; AVX1-NEXT: vmovaps %ymm4, %ymm1 ; AVX1-NEXT: retq @@ -378,18 +377,18 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vcvtss2si %xmm2, %rax +; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vcvtss2si %xmm2, %rcx ; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512-NEXT: vcvtss2si %xmm3, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq %rcx, %xmm3 ; AVX512-NEXT: vcvtss2si %xmm1, %rax -; AVX512-NEXT: vmovq %rax, %xmm3 +; AVX512-NEXT: vmovq %rax, %xmm4 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX512-NEXT: vcvtss2si %xmm1, %rax -; AVX512-NEXT: vmovq 
%rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vcvtss2si %xmm2, %rax ; AVX512-NEXT: vmovq %rax, %xmm2 @@ -475,22 +474,21 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) { ; SSE-LABEL: llrint_v2i64_v2f64: ; SSE: # %bb.0: ; SSE-NEXT: cvtsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvtsd2si %xmm0, %rax +; SSE-NEXT: cvtsd2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: llrint_v2i64_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vcvtsd2si %xmm0, %rax -; AVX-NEXT: vmovq %rax, %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: vcvtsd2si %xmm0, %rax +; AVX-NEXT: vcvtsd2si %xmm0, %rcx ; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vmovq %rcx, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; ; AVX512DQ-LABEL: llrint_v2i64_v2f64: @@ -603,17 +601,17 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vcvtsd2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-NEXT: vcvtsd2si %xmm2, %rax +; AVX1-NEXT: vcvtsd2si %xmm2, %rcx ; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovq %rcx, %xmm3 ; AVX1-NEXT: vcvtsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm3 ; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvtsd2si %xmm0, %rax +; AVX1-NEXT: vcvtsd2si %xmm0, %rcx ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vmovq %rcx, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vcvtsd2si %xmm2, %rax @@ -639,13 +637,13 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-NEXT: vcvtsd2si %xmm1, %rax ; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-NEXT: vcvtsd2si %xmm3, %rax +; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-NEXT: vcvtsd2si %xmm3, %rcx ; AVX512-NEXT: vmovq %rax, %xmm3 -; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-NEXT: vcvtsd2si %xmm2, %rax -; AVX512-NEXT: vmovq %rax, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vmovq %rcx, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-lrint-f16.ll b/llvm/test/CodeGen/X86/vector-lrint-f16.ll index 1316f808aa27e..ffcb864699f1c 100644 --- a/llvm/test/CodeGen/X86/vector-lrint-f16.ll +++ 
b/llvm/test/CodeGen/X86/vector-lrint-f16.ll @@ -88,11 +88,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -102,11 +102,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 @@ -116,11 +116,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -152,11 +152,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -166,11 +166,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 @@ -180,11 +180,11 @@ define <2 x iXLen> @lrint_v2f16(<2 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = 
xmm0[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -265,11 +265,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -279,11 +279,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 @@ -293,11 +293,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -329,11 +329,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -343,11 +343,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 
@@ -357,11 +357,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -393,11 +393,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -429,11 +429,11 @@ define <4 x iXLen> @lrint_v4f16(<4 x half> %x) { ; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -470,11 +470,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -484,11 +484,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 @@ -498,11 +498,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps 
%xmm2, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -534,11 +534,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 @@ -548,11 +548,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 @@ -562,11 +562,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 @@ -599,11 +599,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -613,11 +613,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 @@ -626,11 +626,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x 
half> %x) { ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm2 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 @@ -664,11 +664,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -678,11 +678,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 @@ -691,11 +691,11 @@ define <8 x iXLen> @lrint_v8f16(<8 x half> %x) { ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 @@ -734,11 +734,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm2 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 @@ -748,11 +748,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: 
vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -762,11 +762,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 @@ -776,12 +776,12 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvttss2si %xmm1, %eax +; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I16-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm3 ; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm2 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm2 ; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 @@ -789,11 +789,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvttss2si %xmm2, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm2 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 @@ -803,11 +803,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -817,11 +817,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: 
vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 @@ -855,11 +855,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm2 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 @@ -869,11 +869,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -883,11 +883,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 @@ -897,12 +897,12 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvttss2si %xmm1, %eax +; X64-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I16-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm3 ; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; X64-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm2 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm2 ; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 @@ -910,11 +910,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm2 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, 
%xmm2, %xmm2 @@ -924,11 +924,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -938,11 +938,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 @@ -976,11 +976,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -990,11 +990,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 @@ -1010,73 +1010,72 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 ; X86-AVX-I32-NEXT: vextracti128 $1, %ymm0, %xmm0 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; 
X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %eax -; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax +; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm4 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm4 +; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %ecx +; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vroundss 
$4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %edx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm4 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; X86-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm0 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 ; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 ; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; X86-AVX-I32-NEXT: vmovdqa %ymm2, %ymm0 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm4, %xmm4 +; X86-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; X86-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 ; X86-AVX-I32-NEXT: retl ; ; X86-FP16-I32-LABEL: lrint_v16f16: @@ -1099,11 +1098,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 @@ -1113,11 +1112,11 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax ; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 @@ -1133,73 +1132,72 @@ define <16 x iXLen> @lrint_v16f16(<16 x half> %x) { ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 ; X64-AVX-I32-NEXT: vextracti128 $1, %ymm0, %xmm0 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %eax -; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: 
vcvtph2ps %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax +; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm4 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm4 +; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %ecx +; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %edx +; X64-AVX-I32-NEXT: vmovd %ecx, %xmm4 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm0 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 ; 
X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 ; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; X64-AVX-I32-NEXT: vmovdqa %ymm2, %ymm0 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm4, %xmm4 +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 ; X64-AVX-I32-NEXT: retq ; ; X64-FP16-I32-LABEL: lrint_v16f16: @@ -1214,6 +1212,18 @@ declare <16 x iXLen> @llvm.lrint.v16iXLen.v16f16(<16 x half>) define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-LABEL: lrint_v32f32: ; X86-AVX-I16: # %bb.0: +; X86-AVX-I16-NEXT: pushl %ebp +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-I16-NEXT: pushl %ebx +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-I16-NEXT: pushl %edi +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX-I16-NEXT: pushl %esi +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 20 +; X86-AVX-I16-NEXT: .cfi_offset %esi, -20 +; X86-AVX-I16-NEXT: .cfi_offset %edi, -16 +; X86-AVX-I16-NEXT: .cfi_offset %ebx, -12 +; X86-AVX-I16-NEXT: .cfi_offset %ebp, -8 ; X86-AVX-I16-NEXT: vextracti128 $1, %ymm0, %xmm2 ; X86-AVX-I16-NEXT: vpsrld $16, %xmm2, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 @@ -1227,11 +1237,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx ; X86-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -1241,11 +1251,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -1255,11 +1265,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 @@ -1269,12 +1279,12 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I16-NEXT: 
vcvttss2si %xmm2, %eax +; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm2 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm4 ; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; X86-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm3 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 @@ -1289,11 +1299,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vpsrlq $48, %xmm0, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 @@ -1303,141 +1313,149 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm4 +; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx ; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 ; X86-AVX-I16-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 ; X86-AVX-I16-NEXT: vcvttss2si %xmm0, %eax -; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0 -; X86-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X86-AVX-I16-NEXT: vextracti128 $1, %ymm1, %xmm2 -; X86-AVX-I16-NEXT: vpsrld $16, %xmm2, %xmm3 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vcvtph2ps 
%xmm2, %xmm3 -; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; X86-AVX-I16-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X86-AVX-I16-NEXT: vpsrld $16, %xmm3, %xmm0 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I16-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; X86-AVX-I16-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm0 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm5, %xmm4 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ecx +; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm4 +; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %edx +; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vpsrlq $48, %xmm2, %xmm4 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %esi +; X86-AVX-I16-NEXT: vpsrlq $48, %xmm3, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %edi +; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ebx +; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ebp +; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X86-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 -; 
X86-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; X86-AVX-I16-NEXT: vpsrld $16, %xmm1, %xmm3 +; X86-AVX-I16-NEXT: vmovd %edx, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $2, %esi, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $3, %edi, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $4, %ebx, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $5, %ebp, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm3 -; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vpsrld $16, %xmm1, %xmm3 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm5 +; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm5, %xmm4 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax +; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm4 +; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ecx ; X86-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %edx ; X86-AVX-I16-NEXT: vpsrlq $48, %xmm1, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %esi ; X86-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %edi ; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ebx ; X86-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I16-NEXT: vcvttss2si 
%xmm4, %eax -; X86-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; X86-AVX-I16-NEXT: vcvttss2si %xmm4, %ebp +; X86-AVX-I16-NEXT: vmovd %ecx, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $3, %esi, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $4, %edi, %xmm4, %xmm4 +; X86-AVX-I16-NEXT: vpinsrw $5, %ebx, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I16-NEXT: vpinsrw $6, %ebp, %xmm4, %xmm4 ; X86-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I16-NEXT: vcvttss2si %xmm1, %eax -; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm1 -; X86-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; X86-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 +; X86-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-I16-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; X86-AVX-I16-NEXT: popl %esi +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX-I16-NEXT: popl %edi +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-I16-NEXT: popl %ebx +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-I16-NEXT: popl %ebp +; X86-AVX-I16-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-I16-NEXT: retl ; ; X86-FP16-I16-LABEL: lrint_v32f32: @@ -1460,11 +1478,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx ; X64-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -1474,11 +1492,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -1488,11 +1506,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 @@ -1502,12 +1520,12 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, 
%xmm2 ; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax +; X64-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm2 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm4 ; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; X64-AVX-I16-NEXT: vpsrld $16, %xmm0, %xmm3 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax ; X64-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 @@ -1522,11 +1540,11 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vpsrlq $48, %xmm0, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 @@ -1536,141 +1554,141 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm4 +; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx ; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 ; X64-AVX-I16-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 ; X64-AVX-I16-NEXT: vcvttss2si %xmm0, %eax -; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm0 -; X64-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; X64-AVX-I16-NEXT: vextracti128 $1, %ymm1, %xmm2 -; X64-AVX-I16-NEXT: vpsrld $16, %xmm2, %xmm3 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; 
X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm3 -; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx -; X64-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; X64-AVX-I16-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X64-AVX-I16-NEXT: vpsrld $16, %xmm3, %xmm0 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I16-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; X64-AVX-I16-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm0 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm5, %xmm4 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %ecx +; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm4 +; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %edx +; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vpsrlq $48, %xmm2, %xmm4 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %esi +; X64-AVX-I16-NEXT: vpsrlq $48, %xmm3, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %edi +; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %r8d +; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %r9d +; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I16-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X64-AVX-I16-NEXT: 
vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I16-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; X64-AVX-I16-NEXT: vpsrld $16, %xmm1, %xmm3 +; X64-AVX-I16-NEXT: vmovd %edx, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $2, %esi, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $3, %edi, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $4, %r8d, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $5, %r9d, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm3 -; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vpsrld $16, %xmm1, %xmm3 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I16-NEXT: vcvttss2si %xmm3, %ecx -; X64-AVX-I16-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm3, %xmm5 +; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm5, %xmm4 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax +; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm4 +; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %ecx ; X64-AVX-I16-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %edx ; X64-AVX-I16-NEXT: vpsrlq $48, %xmm1, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %esi ; X64-AVX-I16-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %edi ; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %r8d ; X64-AVX-I16-NEXT: vshufps {{.*#+}} xmm4 = xmm1[3,3,3,3] ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm4, %xmm4 -; 
X64-AVX-I16-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I16-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; X64-AVX-I16-NEXT: vcvttss2si %xmm4, %r9d +; X64-AVX-I16-NEXT: vmovd %ecx, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $3, %esi, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $4, %edi, %xmm4, %xmm4 +; X64-AVX-I16-NEXT: vpinsrw $5, %r8d, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X64-AVX-I16-NEXT: vpinsrw $6, %r9d, %xmm4, %xmm4 ; X64-AVX-I16-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I16-NEXT: vcvttss2si %xmm1, %eax -; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm3, %xmm1 -; X64-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; X64-AVX-I16-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 +; X64-AVX-I16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-I16-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; X64-AVX-I16-NEXT: retq ; ; X64-FP16-I16-LABEL: lrint_v32f32: @@ -1680,234 +1698,255 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; ; X86-AVX-I32-LABEL: lrint_v32f32: ; X86-AVX-I32: # %bb.0: -; X86-AVX-I32-NEXT: vmovdqa %ymm0, %ymm2 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %eax -; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; X86-AVX-I32-NEXT: pushl %ebx +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-I32-NEXT: pushl %edi +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-I32-NEXT: pushl %esi +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX-I32-NEXT: subl $48, %esp +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 64 +; X86-AVX-I32-NEXT: .cfi_offset %esi, -16 +; X86-AVX-I32-NEXT: .cfi_offset %edi, -12 +; X86-AVX-I32-NEXT: .cfi_offset %ebx, -8 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm2 +; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps 
%xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X86-AVX-I32-NEXT: vpsrld $16, %xmm2, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrlq $48, %xmm2, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; X86-AVX-I32-NEXT: vextracti128 $1, %ymm2, %xmm2 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm4 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] +; X86-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] -; 
X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrld $16, %xmm2, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm4 -; X86-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vcvttss2si %xmm4, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm4 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 -; X86-AVX-I32-NEXT: vpsrlq $48, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX-I32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx ; X86-AVX-I32-NEXT: vmovd %ecx, %xmm2 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, 
%eax ; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 -; X86-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X86-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; X86-AVX-I32-NEXT: vextracti128 $1, %ymm1, %xmm1 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vmovdqu %xmm2, (%esp) # 16-byte Spill +; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm5 = xmm1[3,3,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; 
X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X86-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm5 -; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %ecx -; X86-AVX-I32-NEXT: vmovd %ecx, %xmm5 -; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] -; X86-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 -; X86-AVX-I32-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 -; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm6, %xmm6 -; X86-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 -; X86-AVX-I32-NEXT: vcvttss2si %xmm6, %eax -; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm5, %xmm5 -; X86-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm5 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, 
%xmm2, %xmm2 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm6 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm0 +; X86-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm0 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X86-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X86-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx +; X86-AVX-I32-NEXT: vextracti128 $1, %ymm1, %xmm7 +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %edx +; X86-AVX-I32-NEXT: vshufpd {{.*#+}} xmm1 = xmm7[1,0] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %esi +; X86-AVX-I32-NEXT: vshufps {{.*#+}} xmm1 = xmm7[3,3,3,3] ; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %edi +; X86-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %ebx +; X86-AVX-I32-NEXT: vpsrld $16, %xmm7, %xmm1 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %eax -; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 -; X86-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm3 -; X86-AVX-I32-NEXT: vmovdqa %ymm4, %ymm1 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm2 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm0 +; X86-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X86-AVX-I32-NEXT: vmovd %esi, %xmm0 +; X86-AVX-I32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm7[1,1,3,3] +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vpinsrd $3, %ebx, 
%xmm0, %xmm3 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 # 32-byte Reload +; X86-AVX-I32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; X86-AVX-I32-NEXT: vcvttss2si %xmm1, %edx +; X86-AVX-I32-NEXT: vmovd %ecx, %xmm1 +; X86-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X86-AVX-I32-NEXT: vpinsrd $2, %edx, %xmm1, %xmm0 +; X86-AVX-I32-NEXT: vinserti128 $1, (%esp), %ymm5, %ymm1 # 16-byte Folded Reload +; X86-AVX-I32-NEXT: vpsrlq $48, %xmm7, %xmm5 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 +; X86-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 +; X86-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; X86-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 +; X86-AVX-I32-NEXT: vcvttss2si %xmm5, %eax +; X86-AVX-I32-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; X86-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X86-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; X86-AVX-I32-NEXT: vmovdqa %ymm4, %ymm0 +; X86-AVX-I32-NEXT: addl $48, %esp +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX-I32-NEXT: popl %esi +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX-I32-NEXT: popl %edi +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-I32-NEXT: popl %ebx +; X86-AVX-I32-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-I32-NEXT: retl ; ; X86-FP16-I32-LABEL: lrint_v32f32: @@ -1920,234 +1959,232 @@ define <32 x iXLen> @lrint_v32f32(<32 x half> %x) { ; ; X64-AVX-I32-LABEL: lrint_v32f32: ; X64-AVX-I32: # %bb.0: -; X64-AVX-I32-NEXT: vmovdqa %ymm0, %ymm2 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax -; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx +; X64-AVX-I32-NEXT: vmovd %ecx, %xmm2 +; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; 
X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X64-AVX-I32-NEXT: vpsrld $16, %xmm2, %xmm3 +; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm3 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm3 ; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 ; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm3 ; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrlq $48, %xmm2, %xmm4 +; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; X64-AVX-I32-NEXT: vextracti128 $1, %ymm2, %xmm2 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3,3,3] +; X64-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax ; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrld $16, %xmm2, 
%xmm4 +; X64-AVX-I32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %eax -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm4 +; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vroundss $4, %xmm4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvttss2si %xmm4, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm4 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4 -; X64-AVX-I32-NEXT: vpsrlq $48, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax -; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm2 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 -; X64-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: 
vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm5 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; X64-AVX-I32-NEXT: vextracti128 $1, %ymm1, %xmm1 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %eax -; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vroundss $4, %xmm3, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vcvttss2si %xmm3, %ecx -; X64-AVX-I32-NEXT: vmovd %ecx, %xmm3 -; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm5 = xmm1[3,3,3,3] +; X64-AVX-I32-NEXT: vpsrld $16, %xmm0, %xmm5 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm4, %xmm4 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 -; X64-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm5 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %eax -; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm5 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm5 ; X64-AVX-I32-NEXT: vroundss $4, %xmm5, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm5, %xmm5 ; X64-AVX-I32-NEXT: vcvttss2si %xmm5, %ecx ; X64-AVX-I32-NEXT: vmovd %ecx, %xmm5 ; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3] ; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 ; X64-AVX-I32-NEXT: vroundss $4, %xmm6, %xmm6, 
%xmm6 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm6, %xmm6 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 ; X64-AVX-I32-NEXT: vcvttss2si %xmm6, %eax +; X64-AVX-I32-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm5, %xmm5 -; X64-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm5, %xmm5 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X64-AVX-I32-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvttss2si %xmm6, %eax +; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvttss2si %xmm6, %eax +; X64-AVX-I32-NEXT: vpsrld $16, %xmm1, %xmm6 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vroundss $4, %xmm6, %xmm6, %xmm6 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm6, %xmm7 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm6 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X64-AVX-I32-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm7 = xmm1[1,1,3,3] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vroundss $4, %xmm7, %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vcvttss2si %xmm7, %eax +; X64-AVX-I32-NEXT: vpsrlq $48, %xmm1, %xmm7 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vroundss $4, %xmm7, %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm7, %xmm7 +; X64-AVX-I32-NEXT: vcvttss2si %xmm7, %eax +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm7 +; X64-AVX-I32-NEXT: vextracti128 $1, %ymm1, %xmm8 +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm0 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, 
%xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %eax +; X64-AVX-I32-NEXT: vshufpd {{.*#+}} xmm0 = xmm8[1,0] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm0, %ecx +; X64-AVX-I32-NEXT: vmovd %ecx, %xmm0 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3,3,3] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %eax +; X64-AVX-I32-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx +; X64-AVX-I32-NEXT: vpsrld $16, %xmm8, %xmm1 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 ; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %eax -; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1 -; X64-AVX-I32-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm3 -; X64-AVX-I32-NEXT: vmovdqa %ymm4, %ymm1 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm8, %xmm1 +; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %edx +; X64-AVX-I32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3] +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm9 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 +; X64-AVX-I32-NEXT: vcvttss2si %xmm1, %ecx +; X64-AVX-I32-NEXT: vmovd %edx, %xmm1 +; X64-AVX-I32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX-I32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm3 +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm1 +; X64-AVX-I32-NEXT: vpsrlq $48, %xmm8, %xmm2 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvtph2ps %xmm2, %xmm2 +; X64-AVX-I32-NEXT: vcvttss2si %xmm2, %eax +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm2 +; X64-AVX-I32-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 +; X64-AVX-I32-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 ; X64-AVX-I32-NEXT: retq ; ; X64-FP16-I32-LABEL: lrint_v32f32: diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll index b1c8d46f497f3..aff404405ff92 100644 --- a/llvm/test/CodeGen/X86/vector-lrint.ll +++ b/llvm/test/CodeGen/X86/vector-lrint.ll @@ -54,21 +54,21 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) { ; X64-AVX1-i64-LABEL: lrint_v2f32: ; X64-AVX1-i64: # %bb.0: ; X64-AVX1-i64-NEXT: vcvtss2si %xmm0, %rax -; X64-AVX1-i64-NEXT: vmovq %rax, %xmm1 ; X64-AVX1-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; 
X64-AVX1-i64-NEXT: vcvtss2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vcvtss2si %xmm0, %rcx ; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-i64-NEXT: vmovq %rcx, %xmm1 +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-AVX1-i64-NEXT: retq ; ; AVX512-i64-LABEL: lrint_v2f32: ; AVX512-i64: # %bb.0: ; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax -; AVX512-i64-NEXT: vmovq %rax, %xmm1 ; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512-i64-NEXT: vcvtss2si %xmm0, %rax +; AVX512-i64-NEXT: vcvtss2si %xmm0, %rcx ; AVX512-i64-NEXT: vmovq %rax, %xmm0 -; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-i64-NEXT: vmovq %rcx, %xmm1 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-i64-NEXT: retq ; ; AVX512DQ-i64-LABEL: lrint_v2f32: @@ -197,17 +197,17 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) { ; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-i64-NEXT: vcvtss2si %xmm2, %rax +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-i64-NEXT: vcvtss2si %xmm2, %rcx ; AVX512-i64-NEXT: vmovq %rax, %xmm2 -; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512-i64-NEXT: vcvtss2si %xmm3, %rax -; AVX512-i64-NEXT: vmovq %rax, %xmm3 -; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vmovq %rcx, %xmm3 ; AVX512-i64-NEXT: vcvtss2si %xmm1, %rax -; AVX512-i64-NEXT: vmovq %rax, %xmm3 ; AVX512-i64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512-i64-NEXT: vcvtss2si %xmm1, %rax +; AVX512-i64-NEXT: vcvtss2si %xmm1, %rcx ; AVX512-i64-NEXT: vmovq %rax, %xmm1 -; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-i64-NEXT: vmovq %rcx, %xmm3 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX512-i64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-i64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-i64-NEXT: vcvtss2si %xmm2, %rax @@ -285,21 +285,21 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) { ; X64-AVX1-i64-LABEL: lrint_v2f64: ; X64-AVX1-i64: # %bb.0: ; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX1-i64-NEXT: vmovq %rax, %xmm1 ; X64-AVX1-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rcx ; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-i64-NEXT: vmovq %rcx, %xmm1 +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-AVX1-i64-NEXT: retq ; ; AVX512-i64-LABEL: lrint_v2f64: ; AVX512-i64: # %bb.0: ; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax -; AVX512-i64-NEXT: vmovq %rax, %xmm1 ; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax +; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rcx ; AVX512-i64-NEXT: vmovq %rax, %xmm0 -; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-i64-NEXT: vmovq %rcx, %xmm1 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-i64-NEXT: retq ; ; AVX512DQ-i64-LABEL: lrint_v2f64: @@ -386,11 +386,11 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: cvtpd2dq %xmm1, %xmm1 +; X86-SSE2-NEXT: cvtpd2dq %xmm1, %xmm3 ; X86-SSE2-NEXT: cvtpd2dq %xmm0, %xmm0 -; X86-SSE2-NEXT: 
movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-SSE2-NEXT: cvtpd2dq %xmm2, %xmm1 ; X86-SSE2-NEXT: cvtpd2dq 8(%ebp), %xmm2 +; X86-SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; X86-SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -420,17 +420,17 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; X64-AVX1-i64: # %bb.0: ; X64-AVX1-i64-NEXT: vextractf128 $1, %ymm0, %xmm2 ; X64-AVX1-i64-NEXT: vcvtsd2si %xmm2, %rax -; X64-AVX1-i64-NEXT: vmovq %rax, %xmm3 ; X64-AVX1-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; X64-AVX1-i64-NEXT: vcvtsd2si %xmm2, %rax +; X64-AVX1-i64-NEXT: vcvtsd2si %xmm2, %rcx ; X64-AVX1-i64-NEXT: vmovq %rax, %xmm2 -; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; X64-AVX1-i64-NEXT: vmovq %rcx, %xmm3 ; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax -; X64-AVX1-i64-NEXT: vmovq %rax, %xmm3 ; X64-AVX1-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rax +; X64-AVX1-i64-NEXT: vcvtsd2si %xmm0, %rcx ; X64-AVX1-i64-NEXT: vmovq %rax, %xmm0 -; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-AVX1-i64-NEXT: vmovq %rcx, %xmm3 +; X64-AVX1-i64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; X64-AVX1-i64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X64-AVX1-i64-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-i64-NEXT: vcvtsd2si %xmm2, %rax @@ -456,13 +456,13 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax ; AVX512-i64-NEXT: vmovq %rax, %xmm1 -; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-i64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax +; AVX512-i64-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512-i64-NEXT: vcvtsd2si %xmm3, %rax +; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512-i64-NEXT: vcvtsd2si %xmm3, %rcx ; AVX512-i64-NEXT: vmovq %rax, %xmm3 -; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax -; AVX512-i64-NEXT: vmovq %rax, %xmm2 +; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-i64-NEXT: vmovq %rcx, %xmm2 ; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index cfb5fac2fd7aa..a93017a751155 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -98,10 +98,10 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pshufb %xmm1, %xmm2 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 @@ -130,10 +130,10 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: 
pshufb %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pshufb %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 @@ -259,10 +259,10 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: pshufb %xmm1, %xmm3 ; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm1 @@ -376,10 +376,10 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrlw $4, %xmm1 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pshufb %xmm1, %xmm2 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm1 @@ -408,10 +408,10 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlw $4, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pshufb %xmm1, %xmm2 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm1 @@ -537,10 +537,10 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: pshufb %xmm1, %xmm3 ; X86-SSE-NEXT: pcmpeqb %xmm2, %xmm1 @@ -1144,10 +1144,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: psrlw $4, %xmm3 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pshufb %xmm3, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 @@ -1164,10 +1164,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pshufb %xmm3, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 @@ -1183,14 +1183,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX1OR2-LABEL: testv8i16: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 
+; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1201,14 +1201,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VL-LABEL: testv8i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1219,14 +1219,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VLBWDQ-LABEL: testv8i16: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1256,10 +1256,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: pxor %xmm4, %xmm4 ; X86-SSE-NEXT: pshufb %xmm1, %xmm2 ; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 @@ -1352,10 +1352,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa 
%xmm0, %xmm3 ; SSSE3-NEXT: psrlw $4, %xmm3 ; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pshufb %xmm3, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm3 @@ -1372,10 +1372,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pshufb %xmm3, %xmm1 ; SSE41-NEXT: pcmpeqb %xmm4, %xmm3 @@ -1391,14 +1391,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX1OR2-LABEL: testv8i16u: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX1OR2-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX1OR2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1OR2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1409,14 +1409,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VL-LABEL: testv8i16u: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX512VL-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1427,14 +1427,14 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: ; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 -; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm5 -; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 -; 
AVX512VLBWDQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 @@ -1464,10 +1464,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: psrlw $4, %xmm1 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: pxor %xmm4, %xmm4 ; X86-SSE-NEXT: pshufb %xmm1, %xmm2 ; X86-SSE-NEXT: pcmpeqb %xmm4, %xmm1 @@ -1815,26 +1815,16 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: foldv2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] -; AVX512VL-NEXT: retq +; NOBW-LABEL: foldv2i64: +; NOBW: # %bb.0: +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; -; AVX512-LABEL: foldv2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] -; AVX512-NEXT: retq -; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] @@ -1849,26 +1839,16 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv2i64u: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: foldv2i64u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] -; AVX512VL-NEXT: retq +; NOBW-LABEL: foldv2i64u: +; NOBW: # %bb.0: +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; -; AVX512-LABEL: foldv2i64u: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [55,0] -; AVX512-NEXT: retq -; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] @@ -1883,26 +1863,16 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv4i32: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: foldv4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] -; AVX512VL-NEXT: retq +; NOBW-LABEL: foldv4i32: +; NOBW: # %bb.0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; -; AVX512-LABEL: foldv4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: 
vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq -; ; X86-SSE-LABEL: foldv4i32: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] @@ -1917,26 +1887,16 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv4i32u: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: foldv4i32u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] -; AVX512VL-NEXT: retq +; NOBW-LABEL: foldv4i32u: +; NOBW: # %bb.0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; -; AVX512-LABEL: foldv4i32u: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq -; ; X86-SSE-LABEL: foldv4i32u: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index db363493e2dac..8ab7f50d92773 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -65,8 +65,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -94,8 +93,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -123,8 +121,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -164,8 +161,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; X86-AVX-LABEL: testv4i64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -251,8 +247,7 @@ define <4 x i64> 
@testv4i64u(<4 x i64> %in) nounwind { ; ; AVX2-LABEL: testv4i64u: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -280,8 +275,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv4i64u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -309,8 +303,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -350,8 +343,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; X86-AVX-LABEL: testv4i64u: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -427,8 +419,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -451,8 +442,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -475,8 +465,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -511,8 +500,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; X86-AVX-LABEL: testv8i32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -583,8 +571,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX2-LABEL: testv8i32u: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -607,8 +594,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv8i32u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -631,8 +617,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -667,8 +652,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; X86-AVX-LABEL: testv8i32u: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -729,16 +713,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -748,16 +731,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -767,16 +749,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -794,16 +775,15 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; X86-AVX-LABEL: testv16i16: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] -; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; X86-AVX-NEXT: vpsrlw $4, %ymm0, 
%ymm3 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 +; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; X86-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; X86-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X86-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; X86-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; X86-AVX-NEXT: vpand %ymm5, %ymm3, %ymm3 +; X86-AVX-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; X86-AVX-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -850,16 +830,15 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16u: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -869,16 +848,15 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv16i16u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -888,16 +866,15 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # 
ymm1 = mem[0,1,0,1] -; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VLBWDQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -915,16 +892,15 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; X86-AVX-LABEL: testv16i16u: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] -; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 +; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm3 ; X86-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 -; X86-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 -; X86-AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; X86-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 +; X86-AVX-NEXT: vpand %ymm5, %ymm3, %ymm3 +; X86-AVX-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; X86-AVX-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; X86-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -961,8 +937,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -975,8 +950,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -989,8 +963,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv32i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -1016,8 +989,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; X86-AVX-LABEL: testv32i8: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 @@ -1057,8 +1029,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX2-LABEL: testv32i8u: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1071,8 +1042,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv32i8u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -1085,8 +1055,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv32i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -1112,8 +1081,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; X86-AVX-LABEL: testv32i8u: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 @@ -1128,30 +1096,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX1-LABEL: foldv4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: foldv4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq 
{{.*#+}} ymm0 = [55,0,64,56] -; AVX512VL-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv4i64: -; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] -; AVX512VLBWDQ-NEXT: retq -; -; AVX512-LABEL: foldv4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64: ; X86-AVX: # %bb.0: @@ -1162,30 +1110,10 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX1-LABEL: foldv4i64u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i64u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: foldv4i64u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] -; AVX512VL-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv4i64u: -; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] -; AVX512VLBWDQ-NEXT: retq -; -; AVX512-LABEL: foldv4i64u: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64u: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64u: ; X86-AVX: # %bb.0: @@ -1196,30 +1124,10 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX1-LABEL: foldv8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: foldv8i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512VL-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv8i32: -; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512VLBWDQ-NEXT: retq -; -; AVX512-LABEL: foldv8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X86-AVX-LABEL: foldv8i32: ; X86-AVX: # %bb.0: @@ -1230,30 +1138,10 @@ define <8 x i32> @foldv8i32() nounwind { } define <8 x i32> @foldv8i32u() nounwind { -; AVX1-LABEL: foldv8i32u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv8i32u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: foldv8i32u: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512VL-NEXT: retq -; -; AVX512VLBWDQ-LABEL: foldv8i32u: -; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512VLBWDQ-NEXT: retq -; -; AVX512-LABEL: foldv8i32u: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32u: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X86-AVX-LABEL: foldv8i32u: ; X86-AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index d35a365508d54..79dfa709b12bd 100644 --- 
a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -29,10 +29,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -62,8 +61,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -108,10 +106,9 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -141,8 +138,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -185,10 +181,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -220,8 +215,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 @@ -272,10 +266,9 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -307,8 +300,7 @@ define <16 x 
i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 @@ -370,8 +362,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z} ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1 @@ -386,8 +377,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512DQ-LABEL: testv32i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -454,8 +444,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm3 {%k1} {z} ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm3, %zmm1 @@ -470,8 +459,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512DQ-LABEL: testv32i16u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -558,8 +546,7 @@ define <64 x 
i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 @@ -568,8 +555,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512DQ-LABEL: testv64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -646,8 +632,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 @@ -656,8 +641,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512DQ-LABEL: testv64i8u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/vector-merge-store-fp-constants.ll b/llvm/test/CodeGen/X86/vector-merge-store-fp-constants.ll index e902f3e995c45..45ba3266f4c3b 100644 --- a/llvm/test/CodeGen/X86/vector-merge-store-fp-constants.ll +++ b/llvm/test/CodeGen/X86/vector-merge-store-fp-constants.ll @@ -5,10 +5,9 @@ define void @merge_8_float_zero_stores(ptr %ptr) { ; DEFAULTCPU-LABEL: merge_8_float_zero_stores: ; DEFAULTCPU: # %bb.0: -; DEFAULTCPU-NEXT: movq $0, (%rdi) -; DEFAULTCPU-NEXT: movq $0, 8(%rdi) -; DEFAULTCPU-NEXT: movq $0, 16(%rdi) -; DEFAULTCPU-NEXT: movq $0, 24(%rdi) +; DEFAULTCPU-NEXT: xorps %xmm0, %xmm0 +; DEFAULTCPU-NEXT: movups %xmm0, 
(%rdi) +; DEFAULTCPU-NEXT: movups %xmm0, 16(%rdi) ; DEFAULTCPU-NEXT: retq ; ; X64CPU-LABEL: merge_8_float_zero_stores: diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 98b5bab98c4f9..6f8aca897a136 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -131,8 +131,8 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl @@ -146,8 +146,8 @@ define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind { ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq @@ -934,42 +934,31 @@ define <32 x i8> @mul_v32i8_neg5(<32 x i8> %a0) nounwind { ; define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: mul_v2i64_17_65: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm0 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; SSE4-LABEL: mul_v2i64_17_65: -; SSE4: # %bb.0: -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [17,65] -; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pmuludq %xmm1, %xmm2 -; SSE4-NEXT: psrlq $32, %xmm0 -; SSE4-NEXT: pmuludq %xmm1, %xmm0 -; SSE4-NEXT: psllq $32, %xmm0 -; SSE4-NEXT: paddq %xmm2, %xmm0 -; SSE4-NEXT: ret{{[l|q]}} +; X86-SSE-LABEL: mul_v2i64_17_65: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: mul_v2i64_17_65: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,65] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: psrlq $32, %xmm0 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_17_65: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [17,65] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_17_65: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, 
%xmm0, %xmm0 @@ -979,7 +968,7 @@ define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_17_65: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,65] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1000,8 +989,8 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl @@ -1015,8 +1004,8 @@ define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind { ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq @@ -1060,9 +1049,9 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,9,17,33,65,129,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2] +; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1083,9 +1072,9 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> ; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,9,17,33,65,129,2,3] ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2] +; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq @@ -1329,42 +1318,31 @@ define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind { ; define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: mul_v2i64_15_63: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm0 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; SSE4-LABEL: mul_v2i64_15_63: -; SSE4: # %bb.0: -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,63] -; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pmuludq %xmm1, %xmm2 -; SSE4-NEXT: psrlq $32, %xmm0 -; 
SSE4-NEXT: pmuludq %xmm1, %xmm0 -; SSE4-NEXT: psllq $32, %xmm0 -; SSE4-NEXT: paddq %xmm2, %xmm0 -; SSE4-NEXT: ret{{[l|q]}} +; X86-SSE-LABEL: mul_v2i64_15_63: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: mul_v2i64_15_63: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,63] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: psrlq $32, %xmm0 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_15_63: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,63] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_15_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1374,7 +1352,7 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_15_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,63] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1391,65 +1369,37 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: mul_v2i64_neg_15_63: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE2-NEXT: paddq %xmm1, %xmm2 -; X86-SSE2-NEXT: psllq $32, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; X86-SSE4-LABEL: mul_v2i64_neg_15_63: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: psrlq $32, %xmm2 -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551601,18446744073709551553] -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE4-NEXT: paddq %xmm1, %xmm2 -; X86-SSE4-NEXT: psllq $32, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE4-NEXT: paddq %xmm2, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE2-LABEL: mul_v2i64_neg_15_63: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE2-NEXT: psrlq $32, %xmm3 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: paddq %xmm3, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; 
X86-SSE-LABEL: mul_v2i64_neg_15_63: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE-NEXT: paddq %xmm1, %xmm2 +; X86-SSE-NEXT: psllq $32, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE4-LABEL: mul_v2i64_neg_15_63: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] -; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE4-NEXT: psrlq $32, %xmm3 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-NEXT: paddq %xmm3, %xmm0 -; X64-SSE4-NEXT: psllq $32, %xmm0 -; X64-SSE4-NEXT: paddq %xmm2, %xmm0 -; X64-SSE4-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_neg_15_63: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: paddq %xmm3, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_15_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1461,7 +1411,7 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_15_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1480,65 +1430,37 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: mul_v2i64_neg_17_65: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm2 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE2-NEXT: paddq %xmm1, %xmm2 -; X86-SSE2-NEXT: psllq $32, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; X86-SSE4-LABEL: mul_v2i64_neg_17_65: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-SSE4-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: psrlq $32, %xmm2 -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm3 = [18446744073709551599,18446744073709551551] -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm2 -; X86-SSE4-NEXT: paddq %xmm1, %xmm2 -; X86-SSE4-NEXT: psllq $32, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 -; X86-SSE4-NEXT: paddq %xmm2, %xmm0 -; X86-SSE4-NEXT: retl -; -; 
X64-SSE2-LABEL: mul_v2i64_neg_17_65: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE2-NEXT: psrlq $32, %xmm3 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: paddq %xmm3, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: mul_v2i64_neg_17_65: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] +; X86-SSE-NEXT: pmuludq %xmm3, %xmm2 +; X86-SSE-NEXT: paddq %xmm1, %xmm2 +; X86-SSE-NEXT: psllq $32, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm3, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE4-LABEL: mul_v2i64_neg_17_65: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] -; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE4-NEXT: psrlq $32, %xmm3 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-NEXT: paddq %xmm3, %xmm0 -; X64-SSE4-NEXT: psllq $32, %xmm0 -; X64-SSE4-NEXT: paddq %xmm2, %xmm0 -; X64-SSE4-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_neg_17_65: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: paddq %xmm3, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_17_65: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1550,7 +1472,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_17_65: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1592,65 +1514,37 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: mul_v2i64_neg_0_1: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlq $32, %xmm3 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: paddq %xmm3, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl 
-; -; X86-SSE4-LABEL: mul_v2i64_neg_0_1: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] -; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE4-NEXT: psrlq $32, %xmm3 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: paddq %xmm3, %xmm0 -; X86-SSE4-NEXT: psllq $32, %xmm0 -; X86-SSE4-NEXT: paddq %xmm2, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE2-LABEL: mul_v2i64_neg_0_1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE2-NEXT: psrlq $32, %xmm3 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: paddq %xmm3, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: mul_v2i64_neg_0_1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE4-LABEL: mul_v2i64_neg_0_1: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] -; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE4-NEXT: psrlq $32, %xmm3 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-NEXT: paddq %xmm3, %xmm0 -; X64-SSE4-NEXT: psllq $32, %xmm0 -; X64-SSE4-NEXT: paddq %xmm2, %xmm0 -; X64-SSE4-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_neg_0_1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: paddq %xmm3, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_neg_0_1: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1662,7 +1556,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_neg_0_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,18446744073709551615] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1681,65 +1575,37 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind { } define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { -; X86-SSE2-LABEL: 
mul_v2i64_15_neg_63: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlq $32, %xmm3 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: paddq %xmm3, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; X86-SSE4-LABEL: mul_v2i64_15_neg_63: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] -; X86-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE4-NEXT: psrlq $32, %xmm3 -; X86-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: paddq %xmm3, %xmm0 -; X86-SSE4-NEXT: psllq $32, %xmm0 -; X86-SSE4-NEXT: paddq %xmm2, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE2-LABEL: mul_v2i64_15_neg_63: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE2-NEXT: psrlq $32, %xmm3 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: paddq %xmm3, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X86-SSE-LABEL: mul_v2i64_15_neg_63: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE4-LABEL: mul_v2i64_15_neg_63: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] -; X64-SSE4-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE4-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE4-NEXT: psrlq $32, %xmm3 -; X64-SSE4-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-NEXT: paddq %xmm3, %xmm0 -; X64-SSE4-NEXT: psllq $32, %xmm0 -; X64-SSE4-NEXT: paddq %xmm2, %xmm0 -; X64-SSE4-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_15_neg_63: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: paddq %xmm3, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_15_neg_63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,18446744073709551553] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1751,7 +1617,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_15_neg_63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} 
xmm1 = [15,18446744073709551553] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,18446744073709551553] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 @@ -1774,8 +1640,8 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE2-NEXT: retl @@ -1789,8 +1655,8 @@ define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind { ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-SSE2-NEXT: retq @@ -1892,42 +1758,31 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> } define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { -; X86-SSE2-LABEL: mul_v2i64_68_132: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm0 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; SSE4-LABEL: mul_v2i64_68_132: -; SSE4: # %bb.0: -; SSE4-NEXT: pmovzxbq {{.*#+}} xmm1 = [68,132] -; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pmuludq %xmm1, %xmm2 -; SSE4-NEXT: psrlq $32, %xmm0 -; SSE4-NEXT: pmuludq %xmm1, %xmm0 -; SSE4-NEXT: psllq $32, %xmm0 -; SSE4-NEXT: paddq %xmm2, %xmm0 -; SSE4-NEXT: ret{{[l|q]}} +; X86-SSE-LABEL: mul_v2i64_68_132: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: mul_v2i64_68_132: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [68,132] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: psrlq $32, %xmm0 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_68_132: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [68,132] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_68_132: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovzxbq {{.*#+}} xmm1 = [68,132] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1937,7 +1792,7 @@ define <2 x i64> 
@mul_v2i64_68_132(<2 x i64> %x) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_68_132: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [68,132] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1954,42 +1809,31 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { } define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { -; X86-SSE2-LABEL: mul_v2i64_60_120: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq $32, %xmm0 -; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE2-NEXT: psllq $32, %xmm0 -; X86-SSE2-NEXT: paddq %xmm2, %xmm0 -; X86-SSE2-NEXT: retl -; -; SSE4-LABEL: mul_v2i64_60_120: -; SSE4: # %bb.0: -; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [60,124] -; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pmuludq %xmm1, %xmm2 -; SSE4-NEXT: psrlq $32, %xmm0 -; SSE4-NEXT: pmuludq %xmm1, %xmm0 -; SSE4-NEXT: psllq $32, %xmm0 -; SSE4-NEXT: paddq %xmm2, %xmm0 -; SSE4-NEXT: ret{{[l|q]}} +; X86-SSE-LABEL: mul_v2i64_60_120: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: retl ; -; X64-SSE2-LABEL: mul_v2i64_60_120: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [60,124] -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE2-NEXT: psrlq $32, %xmm0 -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE2-NEXT: psllq $32, %xmm0 -; X64-SSE2-NEXT: paddq %xmm2, %xmm0 -; X64-SSE2-NEXT: retq +; X64-SSE-LABEL: mul_v2i64_60_120: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [60,124] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: retq ; ; X64-XOP-LABEL: mul_v2i64_60_120: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124] ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-XOP-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-XOP-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -1999,7 +1843,7 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { ; ; X64-AVX2-LABEL: mul_v2i64_60_120: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,124] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124] ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -2099,9 +1943,9 @@ define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) { ; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; X86-SSE4-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X86-SSE4-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE4-NEXT: pmuludq %xmm3, %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb: @@ -2123,9 +1967,9 @@ define <4 x i64> 
@mul_v4i64_zext_cross_bb(ptr %in, ptr %y) { ; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; X64-SSE4-NEXT: pmuludq %xmm2, %xmm1 -; X64-SSE4-NEXT: pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; X64-SSE4-NEXT: pmuludq %xmm2, %xmm0 +; X64-SSE4-NEXT: pmuludq %xmm3, %xmm0 ; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v4i64_zext_cross_bb: diff --git a/llvm/test/CodeGen/X86/vector-pack-128.ll b/llvm/test/CodeGen/X86/vector-pack-128.ll index f58f19ecd2481..40360db2f5b8a 100644 --- a/llvm/test/CodeGen/X86/vector-pack-128.ll +++ b/llvm/test/CodeGen/X86/vector-pack-128.ll @@ -25,8 +25,8 @@ define <8 x i16> @trunc_concat_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwi ; ; AVX2-LABEL: trunc_concat_packssdw_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] +; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -68,8 +68,8 @@ define <8 x i16> @trunc_concat_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwi ; ; AVX2-LABEL: trunc_concat_packusdw_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] +; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -185,8 +185,8 @@ define <8 x i16> @concat_trunc_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwi ; ; AVX2-LABEL: concat_trunc_packssdw_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] +; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -231,8 +231,8 @@ define <8 x i16> @concat_trunc_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) nounwi ; ; AVX2-LABEL: concat_trunc_packusdw_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] +; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll index 832e50d147f6c..3339233d1e8a0 100644 --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -58,8 +58,8 @@ define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounw ; ; AVX2-LABEL: trunc_concat_packusdw_256: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -228,8 +228,8 @@ define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounw ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhqdq 
{{.*#+}} xmm2 = xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/vector-pack-512.ll b/llvm/test/CodeGen/X86/vector-pack-512.ll index 30e61a68bb22f..90ce508907ff4 100644 --- a/llvm/test/CodeGen/X86/vector-pack-512.ll +++ b/llvm/test/CodeGen/X86/vector-pack-512.ll @@ -9,9 +9,9 @@ define <32 x i16> @trunc_concat_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $17, %zmm0, %zmm0 ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1 @@ -29,9 +29,9 @@ define <32 x i16> @trunc_concat_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrld $17, %zmm0, %zmm0 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vpmovdw %zmm3, %ymm0 ; AVX512-NEXT: vpmovdw %zmm2, %ymm1 @@ -52,9 +52,9 @@ define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [4,5,12,13,6,7,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,12,13,6,7,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 @@ -75,9 +75,9 @@ define <64 x i8> @trunc_concat_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1 @@ -98,9 +98,9 @@ define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: 
vpmovsxbq {{.*#+}} zmm0 = [4,5,12,13,6,7,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,12,13,6,7,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 @@ -121,9 +121,9 @@ define <64 x i8> @trunc_concat_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vpmovwb %zmm3, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm1 @@ -145,7 +145,7 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1 ; AVX512-NEXT: vpmovdw %zmm0, %ymm2 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = ashr <16 x i32> %a0, @@ -163,7 +163,7 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou ; AVX512-NEXT: vpsrld $23, %zmm1, %zmm1 ; AVX512-NEXT: vpmovdw %zmm0, %ymm2 ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %1 = lshr <16 x i32> %a0, @@ -177,28 +177,28 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512F-LABEL: concat_trunc_packsswb_512: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: concat_trunc_packsswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm0 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = ashr <32 x i16> %a0, @@ -212,28 +212,28 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; AVX512F-LABEL: concat_trunc_packuswb_512: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 +; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,2,9,1,10,3,11] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: concat_trunc_packuswb_512: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; 
AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm0 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,2,10,3,11] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: retq %1 = lshr <32 x i16> %a0, diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll index 5b43acbe52375..83c116e828489 100644 --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -1749,7 +1749,7 @@ define <16 x i1> @is_positive_mask_v16i16_v16i1(<16 x i16> %x, <16 x i1> %y) { ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1851,7 +1851,6 @@ define <32 x i1> @is_positive_mask_v32i8_v32i1(<32 x i8> %x, <32 x i1> %y) { ; ; SSE42-LABEL: is_positive_mask_v32i8_v32i1: ; SSE42: # %bb.0: -; SSE42-NEXT: movq %rdi, %rax ; SSE42-NEXT: movd %esi, %xmm2 ; SSE42-NEXT: pinsrb $1, %edx, %xmm2 ; SSE42-NEXT: pinsrb $2, %ecx, %xmm2 @@ -1883,6 +1882,7 @@ define <32 x i1> @is_positive_mask_v32i8_v32i1(<32 x i8> %x, <32 x i1> %y) { ; SSE42-NEXT: pinsrb $12, {{[0-9]+}}(%rsp), %xmm3 ; SSE42-NEXT: pinsrb $13, {{[0-9]+}}(%rsp), %xmm3 ; SSE42-NEXT: pinsrb $14, {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: movq %rdi, %rax ; SSE42-NEXT: pinsrb $15, {{[0-9]+}}(%rsp), %xmm3 ; SSE42-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE42-NEXT: pcmpgtb %xmm4, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index b5b9af543ed5c..6af28c3397d57 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -107,7 +107,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -115,7 +115,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_2_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -210,7 +210,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_2_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -347,7 +347,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -355,7 +355,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -364,7 +364,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -374,7 +374,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -383,7 +383,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -391,7 +391,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_3_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -486,7 +486,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -623,7 +623,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -631,7 +631,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -640,7 +640,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -650,7 +650,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -659,7 +659,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -667,7 +667,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_4_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = 
[4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -762,7 +762,7 @@ define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -899,7 +899,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -907,7 +907,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -916,7 +916,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -926,7 +926,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -935,7 +935,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] 
; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -943,7 +943,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_5_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1038,7 +1038,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1175,7 +1175,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1183,7 +1183,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1192,7 +1192,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1202,7 +1202,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; 
AVX512VPOPCNTDQVL-NEXT: retq @@ -1211,7 +1211,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1219,7 +1219,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_6_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1314,7 +1314,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1451,7 +1451,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_7_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1459,7 +1459,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1468,7 +1468,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1478,7 +1478,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1487,7 +1487,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1495,7 +1495,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_7_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1600,7 +1600,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1608,7 +1608,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_2_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -1720,7 +1720,7 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_2_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1856,7 +1856,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3,3,3,3,3] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1879,7 +1879,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1890,7 +1890,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1899,7 +1899,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1909,7 +1909,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1918,7 +1918,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1926,7 +1926,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_3_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2038,7 +2038,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2174,7 +2174,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4,4,4,4,4] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2197,7 +2197,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2208,7 +2208,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2217,7 +2217,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2227,7 +2227,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2236,7 +2236,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2244,7 +2244,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_4_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2356,7 +2356,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2492,7 +2492,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5,5,5,5,5] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2515,7 +2515,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2526,7 +2526,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2535,7 +2535,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2545,7 +2545,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2554,7 +2554,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2562,7 +2562,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_5_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2674,7 +2674,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2810,7 +2810,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6,6,6,6,6] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2833,7 +2833,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2844,7 +2844,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2853,7 +2853,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2863,7 +2863,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2872,7 +2872,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2880,7 +2880,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_6_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2992,7 +2992,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3128,7 +3128,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7,7,7,7,7] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3151,7 +3151,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_7_v8i16: ; AVX2: # 
%bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3162,7 +3162,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3171,7 +3171,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3181,7 +3181,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3190,7 +3190,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3198,7 +3198,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_7_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3310,7 +3310,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3446,7 +3446,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8,8,8,8,8] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3469,7 +3469,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: 
ult_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3480,7 +3480,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3489,7 +3489,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3499,7 +3499,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3508,7 +3508,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3516,7 +3516,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_8_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3628,7 +3628,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3764,7 +3764,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [9,9,9,9,9,9,9,9] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9,9,9,9,9,9,9,9] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3787,7 +3787,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) 
{ ; ; AVX2-LABEL: ult_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3798,7 +3798,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3807,7 +3807,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3817,7 +3817,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3826,7 +3826,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3834,7 +3834,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_9_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3946,7 +3946,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4082,7 +4082,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [10,10,10,10,10,10,10,10] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10,10,10,10,10] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4105,7 +4105,7 @@ define <8 
x i16> @ult_10_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4116,7 +4116,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4125,7 +4125,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4135,7 +4135,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4144,7 +4144,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4152,7 +4152,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_10_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4264,7 +4264,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4400,7 +4400,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [11,11,11,11,11,11,11,11] +; SSE41-NEXT: movdqa 
{{.*#+}} xmm0 = [11,11,11,11,11,11,11,11] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4423,7 +4423,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4434,7 +4434,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4443,7 +4443,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4453,7 +4453,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4462,7 +4462,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4470,7 +4470,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_11_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4582,7 +4582,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4718,7 +4718,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, 
%xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [12,12,12,12,12,12,12,12] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12,12,12,12,12] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -4741,7 +4741,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4752,7 +4752,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4761,7 +4761,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4771,7 +4771,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4780,7 +4780,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4788,7 +4788,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_12_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4900,7 +4900,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 
@@ -5036,7 +5036,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [13,13,13,13,13,13,13,13] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [13,13,13,13,13,13,13,13] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5059,7 +5059,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5070,7 +5070,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5079,7 +5079,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5089,7 +5089,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5098,7 +5098,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5106,7 +5106,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_13_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5218,7 +5218,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5354,7 +5354,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [14,14,14,14,14,14,14,14] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [14,14,14,14,14,14,14,14] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5377,7 +5377,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5388,7 +5388,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5397,7 +5397,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5407,7 +5407,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5416,7 +5416,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5424,7 +5424,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_14_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5536,7 +5536,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5672,7 +5672,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15] ; SSE41-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -5695,7 +5695,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_15_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5706,7 +5706,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5715,7 +5715,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5725,7 +5725,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5734,7 +5734,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5742,7 +5742,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_15_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -6011,7 +6011,7 @@ define <4 x 
i32> @ugt_2_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_2_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6174,7 +6174,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [3,3,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6200,7 +6200,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6385,7 +6385,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6548,7 +6548,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [4,4,4,4] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6574,7 +6574,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6759,7 +6759,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6922,7 +6922,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [5,5,5,5] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -6948,7 +6948,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7133,7 +7133,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7296,7 +7296,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [6,6,6,6] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -7322,7 +7322,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7507,7 +7507,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7670,7 +7670,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [7,7,7,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -7696,7 +7696,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7881,7 +7881,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8044,7 +8044,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [8,8,8,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8070,7 +8070,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_8_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8255,7 +8255,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_8_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8418,7 +8418,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [9,9,9,9] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9,9,9,9] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8444,7 +8444,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8629,7 +8629,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8792,7 +8792,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [10,10,10,10] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -8818,7 +8818,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9003,7 +9003,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9166,7 +9166,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [11,11,11,11] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [11,11,11,11] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9192,7 +9192,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_11_v4i32: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9377,7 +9377,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_11_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9540,7 +9540,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [12,12,12,12] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9566,7 +9566,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9751,7 +9751,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9914,7 +9914,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [13,13,13,13] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [13,13,13,13] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -9940,7 +9940,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10125,7 +10125,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10288,7 +10288,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [14,14,14,14] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [14,14,14,14] ; SSE41-NEXT: pcmpgtd %xmm1, 
%xmm0 ; SSE41-NEXT: retq ; @@ -10314,7 +10314,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10499,7 +10499,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10662,7 +10662,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [15,15,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -10688,7 +10688,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10873,7 +10873,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11036,7 +11036,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [16,16,16,16] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11062,7 +11062,7 @@ define <4 x i32> @ult_16_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11247,7 +11247,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11410,7 +11410,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: 
packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [17,17,17,17] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [17,17,17,17] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11436,7 +11436,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11621,7 +11621,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11784,7 +11784,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [18,18,18,18] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18,18,18,18] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -11810,7 +11810,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11995,7 +11995,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12158,7 +12158,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [19,19,19,19] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,19,19,19] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12184,7 +12184,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12369,7 +12369,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, 
%xmm2 @@ -12532,7 +12532,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [20,20,20,20] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [20,20,20,20] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12558,7 +12558,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12743,7 +12743,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12906,7 +12906,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [21,21,21,21] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [21,21,21,21] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -12932,7 +12932,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13117,7 +13117,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13280,7 +13280,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [22,22,22,22] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [22,22,22,22] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -13306,7 +13306,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13491,7 +13491,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13654,7 +13654,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [23,23,23,23] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [23,23,23,23] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -13680,7 +13680,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13865,7 +13865,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14028,7 +14028,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [24,24,24,24] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [24,24,24,24] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14054,7 +14054,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14239,7 +14239,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14402,7 +14402,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [25,25,25,25] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14428,7 +14428,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14613,7 +14613,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: 
ugt_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14776,7 +14776,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [26,26,26,26] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [26,26,26,26] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -14802,7 +14802,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14987,7 +14987,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15150,7 +15150,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [27,27,27,27] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [27,27,27,27] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15176,7 +15176,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15361,7 +15361,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15524,7 +15524,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [28,28,28,28] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [28,28,28,28] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15550,7 +15550,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15735,7 +15735,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15898,7 +15898,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [29,29,29,29] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [29,29,29,29] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -15924,7 +15924,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16109,7 +16109,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16272,7 +16272,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [30,30,30,30] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [30,30,30,30] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -16298,7 +16298,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16483,7 +16483,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16646,7 +16646,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; SSE41-NEXT: psadbw %xmm0, %xmm3 ; SSE41-NEXT: psadbw %xmm0, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [31,31,31,31] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -16672,7 +16672,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_31_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16917,7 +16917,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17050,7 +17050,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_2_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17183,7 +17183,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [3,3,3,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -17199,13 +17199,14 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17215,7 +17216,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17223,7 +17224,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17241,7 +17242,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17358,7 +17359,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17491,7 +17492,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [4,4,4,4] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [4,4,4,4] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -17507,13 +17508,14 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17523,7 +17525,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17531,7 +17533,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17549,7 +17551,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17666,7 +17668,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17799,7 +17801,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [5,5,5,5] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [5,5,5,5] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -17815,13 +17817,14 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17831,7 +17834,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17839,7 +17842,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17857,7 +17860,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17974,7 +17977,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18107,7 +18110,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [6,6,6,6] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -18123,13 +18126,14 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18139,7 +18143,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: 
vpmovsxbq {{.*#+}} xmm1 = [6,6] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18147,7 +18151,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18165,7 +18169,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18282,7 +18286,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18415,7 +18419,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [7,7,7,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [7,7,7,7] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -18431,13 +18435,14 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18447,7 +18452,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18455,7 +18460,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18473,7 +18478,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: 
vpmovsxbq {{.*#+}} xmm1 = [7,7] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18590,7 +18595,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18723,7 +18728,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [8,8,8,8] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -18739,13 +18744,14 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18755,7 +18761,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18763,7 +18769,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18781,7 +18787,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18898,7 +18904,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19031,7 +19037,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [9,9,9,9] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9,9,9,9] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -19047,13 +19053,14 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19063,7 +19070,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19071,7 +19078,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19089,7 +19096,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19206,7 +19213,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19339,7 +19346,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [10,10,10,10] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -19355,13 +19362,14 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: 
vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19371,7 +19379,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19379,7 +19387,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19397,7 +19405,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19514,7 +19522,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19647,7 +19655,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [11,11,11,11] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [11,11,11,11] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -19663,13 +19671,14 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19679,7 +19688,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19687,7 +19696,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19705,7 +19714,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19822,7 +19831,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19955,7 +19964,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [12,12,12,12] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -19971,13 +19980,14 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19987,7 +19997,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19995,7 +20005,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20013,7 +20023,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20130,7 +20140,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20263,7 +20273,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [13,13,13,13] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [13,13,13,13] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -20279,13 +20289,14 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20295,7 +20306,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20303,7 +20314,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20321,7 +20332,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20438,7 +20449,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20571,7 +20582,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [14,14,14,14] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [14,14,14,14] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -20587,13 +20598,14 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20603,7 +20615,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20611,7 +20623,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20629,7 +20641,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20746,7 +20758,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20879,7 +20891,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [15,15,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -20895,13 +20907,14 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20911,7 +20924,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20919,7 +20932,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20937,7 +20950,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21054,7 +21067,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21187,7 +21200,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [16,16,16,16] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -21203,13 +21216,14 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21219,7 +21233,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21227,7 +21241,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21245,7 +21259,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21362,7 +21376,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21495,7 +21509,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [17,17,17,17] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [17,17,17,17] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -21511,13 +21525,14 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21527,7 +21542,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21535,7 +21550,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21553,7 +21568,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21670,7 +21685,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21803,7 +21818,7 @@ define <2 
x i64> @ult_18_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [18,18,18,18] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18,18,18,18] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -21819,13 +21834,14 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21835,7 +21851,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21843,7 +21859,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21861,7 +21877,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21978,7 +21994,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22111,7 +22127,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [19,19,19,19] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,19,19,19] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -22127,13 +22143,14 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22143,7 +22160,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22151,7 +22168,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22169,7 +22186,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22286,7 +22303,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22419,7 +22436,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [20,20,20,20] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [20,20,20,20] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -22435,13 +22452,14 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22451,7 +22469,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22459,7 +22477,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # 
kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22477,7 +22495,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22594,7 +22612,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22727,7 +22745,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [21,21,21,21] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [21,21,21,21] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -22743,13 +22761,14 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22759,7 +22778,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22767,7 +22786,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22785,7 +22804,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22902,7 +22921,7 @@ define 
<2 x i64> @ugt_21_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23035,7 +23054,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [22,22,22,22] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [22,22,22,22] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -23051,13 +23070,14 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_22_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23067,7 +23087,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23075,7 +23095,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23093,7 +23113,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23210,7 +23230,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_22_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23343,7 +23363,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [23,23,23,23] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [23,23,23,23] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ 
-23359,13 +23379,14 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_23_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23375,7 +23396,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23383,7 +23404,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23401,7 +23422,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23518,7 +23539,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_23_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23651,7 +23672,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [24,24,24,24] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [24,24,24,24] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -23667,13 +23688,14 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_24_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23683,7 +23705,7 
@@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23691,7 +23713,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23709,7 +23731,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23826,7 +23848,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_24_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23959,7 +23981,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [25,25,25,25] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -23975,13 +23997,14 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_25_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23991,7 +24014,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23999,7 +24022,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-24017,7 +24040,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24134,7 +24157,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_25_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24267,7 +24290,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [26,26,26,26] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [26,26,26,26] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -24283,13 +24306,14 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_26_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24299,7 +24323,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24307,7 +24331,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24325,7 +24349,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24442,7 +24466,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_26_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24575,7 +24599,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [27,27,27,27] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [27,27,27,27] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -24591,13 +24615,14 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_27_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24607,7 +24632,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24615,7 +24640,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24633,7 +24658,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24750,7 +24775,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_27_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24883,7 +24908,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [28,28,28,28] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [28,28,28,28] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -24899,13 +24924,14 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; 
AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24915,7 +24941,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24923,7 +24949,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24941,7 +24967,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25058,7 +25084,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25191,7 +25217,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [29,29,29,29] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [29,29,29,29] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -25207,13 +25233,14 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25223,7 +25250,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: 
retq ; @@ -25231,7 +25258,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25249,7 +25276,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25366,7 +25393,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25499,7 +25526,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [30,30,30,30] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [30,30,30,30] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -25515,13 +25542,14 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25531,7 +25559,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25539,7 +25567,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25557,7 +25585,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] ; 
BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25674,7 +25702,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25807,7 +25835,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [31,31,31,31] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [31,31,31,31] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -25823,13 +25851,14 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25839,7 +25868,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25847,7 +25876,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25865,7 +25894,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25982,7 +26011,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26115,7 +26144,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} 
xmm0 = [32,32,32,32] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -26131,13 +26160,14 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26147,7 +26177,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26155,7 +26185,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26173,7 +26203,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26290,7 +26320,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26423,7 +26453,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [33,33,33,33] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -26439,13 +26469,14 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26455,7 +26486,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26463,7 +26494,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26481,7 +26512,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26598,7 +26629,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26731,7 +26762,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [34,34,34,34] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [34,34,34,34] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -26747,13 +26778,14 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26763,7 +26795,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26771,7 +26803,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = 
[34,34] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26789,7 +26821,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26906,7 +26938,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27039,7 +27071,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [35,35,35,35] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [35,35,35,35] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -27055,13 +27087,14 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27071,7 +27104,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27079,7 +27112,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27097,7 +27130,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27214,7 +27247,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27347,7 +27380,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [36,36,36,36] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [36,36,36,36] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -27363,13 +27396,14 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27379,7 +27413,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27387,7 +27421,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27405,7 +27439,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27522,7 +27556,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27655,7 +27689,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [37,37,37,37] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [37,37,37,37] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -27671,13 +27705,14 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27687,7 +27722,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27695,7 +27730,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27713,7 +27748,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27830,7 +27865,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27963,7 +27998,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [38,38,38,38] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [38,38,38,38] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -27979,13 +28014,14 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27995,7 +28031,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq 
{{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28003,7 +28039,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28021,7 +28057,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28138,7 +28174,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28271,7 +28307,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [39,39,39,39] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [39,39,39,39] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -28287,13 +28323,14 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28303,7 +28340,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28311,7 +28348,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28329,7 +28366,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, 
%xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28446,7 +28483,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28579,7 +28616,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [40,40,40,40] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [40,40,40,40] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -28595,13 +28632,14 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28611,7 +28649,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28619,7 +28657,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28637,7 +28675,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28754,7 +28792,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28887,7 +28925,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, 
%xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [41,41,41,41] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [41,41,41,41] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -28903,13 +28941,14 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28919,7 +28958,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28927,7 +28966,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28945,7 +28984,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29062,7 +29101,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29195,7 +29234,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [42,42,42,42] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [42,42,42,42] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -29211,13 +29250,14 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29227,7 +29267,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29235,7 +29275,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29253,7 +29293,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29370,7 +29410,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29503,7 +29543,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [43,43,43,43] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [43,43,43,43] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -29519,13 +29559,14 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29535,7 +29576,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29543,7 +29584,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: 
vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29561,7 +29602,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29678,7 +29719,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29811,7 +29852,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [44,44,44,44] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [44,44,44,44] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -29827,13 +29868,14 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29843,7 +29885,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29851,7 +29893,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29869,7 +29911,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29986,7 +30028,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: 
ugt_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30119,7 +30161,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [45,45,45,45] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [45,45,45,45] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -30135,13 +30177,14 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30151,7 +30194,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30159,7 +30202,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30177,7 +30220,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30294,7 +30337,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30427,7 +30470,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [46,46,46,46] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [46,46,46,46] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -30443,13 +30486,14 @@ define <2 x i64> @ult_46_v2i64(<2 x 
i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30459,7 +30503,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30467,7 +30511,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30485,7 +30529,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30602,7 +30646,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30735,7 +30779,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [47,47,47,47] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [47,47,47,47] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -30751,13 +30795,14 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30767,7 +30812,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; 
AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30775,7 +30820,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30793,7 +30838,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30910,7 +30955,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31043,7 +31088,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [48,48,48,48] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [48,48,48,48] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -31059,13 +31104,14 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31075,7 +31121,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31083,7 +31129,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31101,7 +31147,7 @@ define <2 x i64> @ult_48_v2i64(<2 x 
i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31218,7 +31264,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31351,7 +31397,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [49,49,49,49] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [49,49,49,49] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -31367,13 +31413,14 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31383,7 +31430,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31391,7 +31438,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31409,7 +31456,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31526,7 +31573,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31659,7 +31706,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [50,50,50,50] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [50,50,50,50] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -31675,13 +31722,14 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31691,7 +31739,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31699,7 +31747,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31717,7 +31765,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31834,7 +31882,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31967,7 +32015,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [51,51,51,51] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -31983,13 +32031,14 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; 
AVX2-LABEL: ult_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31999,7 +32048,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32007,7 +32056,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32025,7 +32074,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32142,7 +32191,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32275,7 +32324,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [52,52,52,52] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [52,52,52,52] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -32291,13 +32340,14 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32307,7 +32357,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32315,7 +32365,7 @@ define <2 x i64> 
@ult_52_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32333,7 +32383,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32450,7 +32500,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32583,7 +32633,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [53,53,53,53] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [53,53,53,53] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -32599,13 +32649,14 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32615,7 +32666,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32623,7 +32674,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32641,7 +32692,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; 
BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32758,7 +32809,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32891,7 +32942,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [54,54,54,54] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [54,54,54,54] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -32907,13 +32958,14 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32923,7 +32975,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32931,7 +32983,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32949,7 +33001,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33066,7 +33118,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33199,7 +33251,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [55,55,55,55] +; SSE41-NEXT: movdqa 
{{.*#+}} xmm0 = [55,55,55,55] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -33215,13 +33267,14 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33231,7 +33284,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33239,7 +33292,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33257,7 +33310,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33374,7 +33427,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33507,7 +33560,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [56,56,56,56] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [56,56,56,56] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -33523,13 +33576,14 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33539,7 +33593,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33547,7 +33601,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33565,7 +33619,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33682,7 +33736,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33815,7 +33869,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [57,57,57,57] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [57,57,57,57] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -33831,13 +33885,14 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33847,7 +33902,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33855,7 +33910,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQ-NEXT: 
vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33873,7 +33928,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33990,7 +34045,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34123,7 +34178,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [58,58,58,58] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [58,58,58,58] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -34139,13 +34194,14 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34155,7 +34211,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34163,7 +34219,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34181,7 +34237,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34298,7 +34354,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34431,7 +34487,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [59,59,59,59] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [59,59,59,59] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -34447,13 +34503,14 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34463,7 +34520,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34471,7 +34528,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34489,7 +34546,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34606,7 +34663,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34739,7 +34796,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [60,60,60,60] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [60,60,60,60] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -34755,13 +34812,14 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} 
xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34771,7 +34829,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34779,7 +34837,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34797,7 +34855,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34914,7 +34972,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35047,7 +35105,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [61,61,61,61] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [61,61,61,61] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -35063,13 +35121,14 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35079,7 +35138,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35087,7 +35146,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35105,7 +35164,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35222,7 +35281,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35355,7 +35414,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [62,62,62,62] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [62,62,62,62] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -35371,13 +35430,14 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35387,7 +35447,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35395,7 +35455,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35413,7 +35473,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; 
BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35530,7 +35590,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35663,7 +35723,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [63,63,63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [63,63,63,63] ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -35679,13 +35739,14 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ult_63_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35695,7 +35756,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35703,7 +35764,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35721,7 +35782,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index c1d30b6d5a995..79c0b2b38b7e6 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -100,7 +100,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX2-LABEL: testv2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand 
%xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -269,7 +269,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX2-LABEL: testv4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -439,7 +439,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX2-LABEL: testv8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -579,7 +579,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX2-LABEL: testv16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -593,8 +593,8 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; XOP: # %bb.0: ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; XOP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; XOP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -629,29 +629,19 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] -; AVX1OR2-NEXT: retq -; -; XOP-LABEL: foldv2i64: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] -; XOP-NEXT: retq -; -; AVX512-LABEL: foldv2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] -; AVX512-NEXT: retq +; AVX-LABEL: foldv2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,64] +; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] ; BITALG-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> ) ret <2 x i64> %out @@ -663,29 +653,19 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: foldv4i32: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] -; AVX1OR2-NEXT: retq -; -; XOP-LABEL: foldv4i32: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] -; XOP-NEXT: retq -; -; AVX512-LABEL: foldv4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] -; AVX512-NEXT: retq +; AVX-LABEL: foldv4i32: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX-NEXT: retq ; ; 
BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,32,0,8] +; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] ; BITALG-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> ) ret <4 x i32> %out @@ -863,8 +843,8 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ne_1_v2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1055,8 +1035,8 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; AVX512VPOPCNTDQ-LABEL: ne_1_v4i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1065,8 +1045,8 @@ define <4 x i32> @ne_1_v4i32(<4 x i32> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ne_1_v4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index 487f9a5d326cf..2e2ef4675e14b 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -126,7 +126,7 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_2_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -163,8 +163,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -177,8 +176,7 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; 
AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -189,10 +187,9 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -247,8 +244,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -262,8 +258,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -275,16 +270,15 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ult_3_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb 
%ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -299,7 +293,7 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_3_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -336,8 +330,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -350,8 +343,7 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -362,10 +354,9 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -420,8 +411,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -435,8 +425,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -448,16 +437,15 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ult_4_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -472,7 +460,7 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_4_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -509,8 +497,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -523,8 +510,7 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -535,10 +521,9 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -593,8 +578,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -608,8 +592,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -621,16 +604,15 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ult_5_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 
= [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -645,7 +627,7 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_5_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -682,8 +664,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -696,8 +677,7 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -708,10 +688,9 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; 
AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -766,8 +745,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -781,8 +759,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -794,16 +771,15 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ult_6_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -818,7 +794,7 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_6_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -855,8 +831,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> 
%0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -869,8 +844,7 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -881,10 +855,9 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -939,8 +912,7 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -954,8 +926,7 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -967,16 +938,15 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ult_7_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -991,7 +961,7 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; BITALG-LABEL: ult_7_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -1119,7 +1089,7 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_2_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1162,8 +1132,7 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1243,8 +1212,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: 
vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1271,7 +1239,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1286,7 +1254,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_3_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1329,8 +1297,7 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1410,8 +1377,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1438,7 +1404,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1453,7 +1419,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_4_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x 
i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1496,8 +1462,7 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1577,8 +1542,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1605,7 +1569,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1620,7 +1584,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_5_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1663,8 +1627,7 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1744,8 +1707,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1772,7 +1734,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1787,7 +1749,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_6_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1830,8 +1792,7 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1911,8 +1872,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1939,7 +1899,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1954,7 +1914,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_7_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1997,8 +1957,7 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2078,8 +2037,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2106,7 +2064,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2121,7 +2079,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_8_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2164,8 +2122,7 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2245,8 +2202,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2273,7 +2229,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2288,7 +2244,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_9_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2331,8 +2287,7 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2412,8 +2367,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2440,7 +2394,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2455,7 +2409,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_10_v16i16: 
; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2498,8 +2452,7 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2579,8 +2532,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2607,7 +2559,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2622,7 +2574,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_11_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2665,8 +2617,7 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2746,8 +2697,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2774,7 +2724,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2789,7 +2739,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_12_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2832,8 +2782,7 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2913,8 +2862,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2941,7 +2889,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2956,7 +2904,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_13_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2999,8 +2947,7 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3080,8 +3027,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3108,7 +3054,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3123,7 +3069,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_14_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -3166,8 +3112,7 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 
-; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3247,8 +3192,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3275,7 +3219,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3290,7 +3234,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; BITALG-LABEL: ult_15_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -3460,7 +3404,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -3468,8 +3412,7 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3570,8 +3513,7 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3665,7 +3607,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -3673,8 +3615,7 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3775,8 +3716,7 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3870,7 +3810,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -3878,8 +3818,7 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3980,8 +3919,7 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4075,7 +4013,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4083,8 +4021,7 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4185,8 +4122,7 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4280,7 +4216,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4288,8 +4224,7 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4390,8 +4325,7 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; 
AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4485,7 +4419,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4493,8 +4427,7 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4595,8 +4528,7 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4690,7 +4622,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4698,8 +4630,7 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4800,8 +4731,7 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4895,7 +4825,7 @@ define <8 x i32> 
@ugt_9_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4903,8 +4833,7 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5005,8 +4934,7 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5100,7 +5028,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5108,8 +5036,7 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5210,8 +5137,7 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5305,7 +5231,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5313,8 +5239,7 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5415,8 +5340,7 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5510,7 +5434,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5518,8 +5442,7 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5620,8 +5543,7 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5715,7 +5637,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5723,8 +5645,7 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5825,8 +5746,7 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5920,7 +5840,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5928,8 +5848,7 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6030,8 +5949,7 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6125,7 +6043,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = 
[1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -6133,8 +6051,7 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6235,8 +6152,7 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6330,7 +6246,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -6338,8 +6254,7 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6440,8 +6355,7 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6535,7 +6449,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; 
AVX1-NEXT: retq ; @@ -6543,8 +6457,7 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6645,8 +6558,7 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6740,7 +6652,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -6748,8 +6660,7 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6850,8 +6761,7 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6945,7 +6855,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -6953,8 +6863,7 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; 
AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7055,8 +6964,7 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7150,7 +7058,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -7158,8 +7066,7 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7260,8 +7167,7 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7355,7 +7261,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -7363,8 +7269,7 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7465,8 +7370,7 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7560,7 +7464,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -7568,8 +7472,7 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7670,8 +7573,7 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7765,7 +7667,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -7773,8 +7675,7 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: 
vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7875,8 +7776,7 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7970,7 +7870,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -7978,8 +7878,7 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8080,8 +7979,7 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8175,7 +8073,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -8183,8 +8081,7 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8285,8 +8182,7 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8380,7 +8276,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -8388,8 +8284,7 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8490,8 +8385,7 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8585,7 +8479,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -8593,8 +8487,7 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8695,8 +8588,7 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8790,7 +8682,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -8798,8 +8690,7 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8900,8 +8791,7 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8995,7 +8885,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -9003,8 +8893,7 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9105,8 +8994,7 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9200,7 +9088,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1] ; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -9208,8 +9096,7 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9310,8 +9197,7 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9525,7 +9411,8 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9535,8 +9422,7 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ 
-9608,7 +9494,8 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9618,8 +9505,7 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9691,7 +9577,8 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9701,8 +9588,7 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9774,7 +9660,8 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9784,8 +9671,7 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9857,7 +9743,8 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4,4] +; AVX1-NEXT: # xmm1 = mem[0,0] ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9867,8 +9754,7 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9940,7 +9826,8 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -9950,8 +9837,7 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10023,7 +9909,8 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [5,5] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10033,8 +9920,7 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10106,7 +9992,8 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10116,8 +10003,7 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10189,7 +10075,8 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [6,6] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10199,8 +10086,7 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10272,7 +10158,8 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10282,8 +10169,7 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10355,7 +10241,8 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [7,7] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10365,8 +10252,7 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10438,7 +10324,8 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10448,8 +10335,7 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10521,7 +10407,8 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [8,8] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10531,8 +10418,7 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10604,7 +10490,8 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10614,8 +10501,7 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10687,7 +10573,8 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, 
%xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9,9] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10697,8 +10584,7 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10770,7 +10656,8 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10780,8 +10667,7 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10853,7 +10739,8 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [10,10] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10863,8 +10750,7 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10936,7 +10822,8 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, 
%xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -10946,8 +10833,7 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11019,7 +10905,8 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [11,11] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11029,8 +10916,7 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11102,7 +10988,8 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11112,8 +10999,7 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11185,7 +11071,8 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [12,12] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11195,8 +11082,7 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11268,7 +11154,8 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11278,8 +11165,7 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11351,7 +11237,8 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [13,13] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11361,8 +11248,7 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11434,7 +11320,8 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11444,8 +11331,7 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11517,7 +11403,8 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [14,14] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11527,8 +11414,7 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11600,7 +11486,8 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11610,8 +11497,7 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11683,7 +11569,8 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [15,15] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11693,8 +11580,7 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11766,7 +11652,8 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw 
%xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11776,8 +11663,7 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11849,7 +11735,8 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [16,16] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11859,8 +11746,7 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11932,7 +11818,8 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -11942,8 +11829,7 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12015,7 +11901,8 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [17,17] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ 
-12025,8 +11912,7 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12098,7 +11984,8 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12108,8 +11995,7 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12181,7 +12067,8 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18,18] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12191,8 +12078,7 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12264,7 +12150,8 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12274,8 +12161,7 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12347,7 +12233,8 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [19,19] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12357,8 +12244,7 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12430,7 +12316,8 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12440,8 +12327,7 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12513,7 +12399,8 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [20,20] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12523,8 +12410,7 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: 
vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12596,7 +12482,8 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12606,8 +12493,7 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12679,7 +12565,8 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [21,21] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12689,8 +12576,7 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12762,7 +12648,8 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12772,8 +12659,7 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12845,7 +12731,8 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} 
xmm1 = [22,22] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [22,22] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12855,8 +12742,7 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12928,7 +12814,8 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -12938,8 +12825,7 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13011,7 +12897,8 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [23,23] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13021,8 +12908,7 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13094,7 +12980,8 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13104,8 +12991,7 @@ define <4 x i64> @ult_24_v4i64(<4 x 
i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13177,7 +13063,8 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [24,24] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13187,8 +13074,7 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13260,7 +13146,8 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13270,8 +13157,7 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13343,7 +13229,8 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [25,25] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13353,8 +13240,7 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13426,7 +13312,8 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13436,8 +13323,7 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13509,7 +13395,8 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [26,26] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13519,8 +13406,7 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13592,7 +13478,8 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13602,8 +13489,7 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13675,7 +13561,8 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [27,27] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13685,8 +13572,7 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13758,7 +13644,8 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13768,8 +13655,7 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13841,7 +13727,8 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [28,28] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13851,8 +13738,7 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13924,7 +13810,8 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -13934,8 +13821,7 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14007,7 +13893,8 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [29,29] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14017,8 +13904,7 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14090,7 +13976,8 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14100,8 +13987,7 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14173,7 +14059,8 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [30,30] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14183,8 +14070,7 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14256,7 +14142,8 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14266,8 +14153,7 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14339,7 +14225,8 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [31,31] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14349,8 +14236,7 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14422,7 +14308,8 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14432,8 +14319,7 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14505,7 +14391,8 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32,32] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14515,8 +14402,7 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14588,7 +14474,8 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14598,8 +14485,7 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14671,7 +14557,8 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [33,33] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14681,8 +14568,7 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14754,7 +14640,8 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14764,8 +14651,7 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14837,7 +14723,8 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [34,34] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14847,8 +14734,7 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14920,7 +14806,8 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -14930,8 +14817,7 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15003,7 +14889,8 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [35,35] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [35,35] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15013,8 +14900,7 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15086,7 +14972,8 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15096,8 +14983,7 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15169,7 +15055,8 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [36,36] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15179,8 +15066,7 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15252,7 +15138,8 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15262,8 +15149,7 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15335,7 +15221,8 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [37,37] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15345,8 +15232,7 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15418,7 +15304,8 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15428,8 +15315,7 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15501,7 +15387,8 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [38,38] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15511,8 +15398,7 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15584,7 +15470,8 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15594,8 +15481,7 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15667,7 +15553,8 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [39,39] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15677,8 +15564,7 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15750,7 +15636,8 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15760,8 +15647,7 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15833,7 +15719,8 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [40,40] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15843,8 +15730,7 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15916,7 +15802,8 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -15926,8 +15813,7 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15999,7 +15885,8 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [41,41] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16009,8 +15896,7 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16082,7 +15968,8 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16092,8 +15979,7 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16165,7 +16051,8 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [42,42] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16175,8 +16062,7 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16248,7 +16134,8 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16258,8 +16145,7 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16331,7 +16217,8 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [43,43] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16341,8 +16228,7 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16414,7 +16300,8 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16424,8 +16311,7 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16497,7 +16383,8 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [44,44] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16507,8 +16394,7 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16580,7 +16466,8 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16590,8 +16477,7 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16663,7 +16549,8 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [45,45] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16673,8 +16560,7 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16746,7 +16632,8 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16756,8 +16643,7 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16829,7 +16715,8 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [46,46] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16839,8 +16726,7 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16912,7 +16798,8 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -16922,8 +16809,7 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16995,7 +16881,8 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [47,47] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17005,8 +16892,7 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17078,7 +16964,8 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17088,8 +16975,7 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17161,7 +17047,8 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [48,48] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [48,48] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17171,8 +17058,7 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17244,7 +17130,8 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17254,8 +17141,7 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17327,7 +17213,8 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [49,49] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17337,8 +17224,7 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17410,7 +17296,8 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17420,8 +17307,7 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17493,7 +17379,8 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [50,50] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17503,8 +17390,7 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17576,7 +17462,8 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17586,8 +17473,7 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17659,7 +17545,8 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [51,51] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17669,8 +17556,7 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17742,7 +17628,8 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17752,8 +17639,7 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17825,7 +17711,8 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [52,52] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17835,8 +17722,7 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17908,7 +17794,8 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -17918,8 +17805,7 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17991,7 +17877,8 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [53,53] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18001,8 +17888,7 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18074,7 +17960,8 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18084,8 +17971,7 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18157,7 +18043,8 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [54,54] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18167,8 +18054,7 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18240,7 +18126,8 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18250,8 +18137,7 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18323,7 +18209,8 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [55,55] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18333,8 +18220,7 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18406,7 +18292,8 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18416,8 +18303,7 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18489,7 +18375,8 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [56,56] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18499,8 +18386,7 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18572,7 +18458,8 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18582,8 +18469,7 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18655,7 +18541,8 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [57,57] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18665,8 +18552,7 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18738,7 +18624,8 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18748,8 +18635,7 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18821,7 +18707,8 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [58,58] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18831,8 +18718,7 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18904,7 +18790,8 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18914,8 +18801,7 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18987,7 +18873,8 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [59,59] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -18997,8 +18884,7 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19070,7 +18956,8 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19080,8 +18967,7 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19153,7 +19039,8 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [60,60] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19163,8 +19050,7 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19236,7 +19122,8 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19246,8 +19133,7 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19319,7 +19205,8 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [61,61] +; 
AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [61,61] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19329,8 +19216,7 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19402,7 +19288,8 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19412,8 +19299,7 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19485,7 +19371,8 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [62,62] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19495,8 +19382,7 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19568,7 +19454,8 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [63,63] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -19578,8 +19465,7 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; AVX2: # 
%bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index 7fb60b987d95d..7d12bc6d2fba0 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -35,8 +35,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -134,8 +133,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -248,8 +246,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -339,8 +336,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -371,8 +367,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -382,10 +377,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -409,79 +403,19 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX1-LABEL: foldv4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] -; AVX2-NEXT: retq -; -; XOP-LABEL: foldv4i64: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] -; XOP-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv4i64: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv4i64: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] -; AVX512VPOPCNTDQVL-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv4i64: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv4i64: -; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,64,0,8] -; BITALG-NEXT: retq +; ALL-LABEL: foldv4i64: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; ALL-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> ) ret <4 x i64> %out } define <8 x i32> @foldv8i32() nounwind { -; AVX1-LABEL: foldv8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; AVX2-NEXT: retq -; -; XOP-LABEL: foldv8i32: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; XOP-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv8i32: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv8i32: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; AVX512VPOPCNTDQVL-NEXT: retq -; -; BITALG_NOVLX-LABEL: 
foldv8i32: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv8i32: -; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; BITALG-NEXT: retq +; ALL-LABEL: foldv8i32: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; ALL-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> ) ret <8 x i32> %out } @@ -632,8 +566,8 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; AVX512VPOPCNTDQ-LABEL: ne_1_v4i64: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -641,8 +575,8 @@ define <4 x i64> @ne_1_v4i64(<4 x i64> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ne_1_v4i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq @@ -792,8 +726,8 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; AVX512VPOPCNTDQ-LABEL: ne_1_v8i32: ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -801,8 +735,8 @@ define <8 x i32> @ne_1_v8i32(<8 x i32> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ne_1_v8i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll index 1618a647a4062..67ebcde5072e3 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -127,8 +127,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -149,10 +148,9 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_2_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -166,8 +164,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -188,10 +185,9 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -218,8 +214,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -240,10 +235,9 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -257,8 +251,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -279,10 +272,9 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -309,8 +301,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512F: # 
%bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -331,10 +322,9 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -348,8 +338,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -370,10 +359,9 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 
= [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -400,8 +388,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -422,10 +409,9 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_4_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -439,8 +425,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -461,10 +446,9 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -491,8 +475,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -513,10 +496,9 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_4_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -530,8 +512,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -552,10 +533,9 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -582,8 +562,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -604,10 +583,9 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_5_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -621,8 +599,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -643,10 +620,9 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -673,8 +649,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -695,10 +670,9 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_5_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -712,8 +686,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -734,10 +707,9 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -764,8 +736,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -786,10 +757,9 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_6_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -803,8 +773,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -825,10 +794,9 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -855,8 +823,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -877,10 +844,9 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_6_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -894,8 +860,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -916,10 +881,9 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -946,8 +910,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -968,10 +931,9 @@ define <64 x 
i8> @ult_7_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_7_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -985,8 +947,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1007,10 +968,9 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1153,8 +1113,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} 
ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1181,10 +1140,9 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_2_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1214,10 +1172,9 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1247,8 +1204,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1275,10 +1231,9 @@ define <32 x i16> 
@ult_3_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1308,10 +1263,9 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1341,8 +1295,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1369,10 +1322,9 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1402,10 +1354,9 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1435,8 +1386,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1463,10 +1413,9 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1496,10 +1445,9 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1529,8 +1477,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1557,10 +1504,9 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: 
vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1590,10 +1536,9 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1623,8 +1568,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1651,10 +1595,9 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1684,10 +1627,9 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1717,8 +1659,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1745,10 +1686,9 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1778,10 +1718,9 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1811,8 +1750,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1839,10 +1777,9 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1872,10 +1809,9 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1905,8 +1841,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1933,10 +1868,9 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1966,10 +1900,9 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1999,8 +1932,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2027,10 +1959,9 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2060,10 +1991,9 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2093,8 +2023,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2121,10 +2050,9 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2154,10 +2082,9 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_7_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2187,8 +2114,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2215,10 +2141,9 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_8_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2248,10 +2173,9 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_8_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2281,8 +2205,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2309,10 +2232,9 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_8_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2342,10 +2264,9 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_8_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2375,8 +2296,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2403,10 +2323,9 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_9_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2436,10 +2355,9 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_9_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2469,8 +2387,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2497,10 +2414,9 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_9_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2530,10 +2446,9 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_9_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2563,8 +2478,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2591,10 +2505,9 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_10_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2624,10 +2537,9 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_10_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2657,8 +2569,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2685,10 +2596,9 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_10_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2718,10 +2628,9 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_10_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2751,8 +2660,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2779,10 +2687,9 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_11_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2812,10 +2719,9 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_11_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2845,8 +2751,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2873,10 +2778,9 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_11_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2906,10 +2810,9 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_11_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2939,8 +2842,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2967,10 +2869,9 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_12_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3000,10 +2901,9 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_12_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3033,8 +2933,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3061,10 +2960,9 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_12_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3094,10 +2992,9 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_12_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3127,8 +3024,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: 
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3155,10 +3051,9 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_13_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3188,10 +3083,9 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_13_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3221,8 +3115,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3249,10 +3142,9 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_13_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3282,10 +3174,9 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_13_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3315,8 +3206,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3343,10 +3233,9 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_14_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3376,10 +3265,9 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_14_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3409,8 +3297,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3437,10 +3324,9 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_14_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3470,10 +3356,9 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_14_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3503,8 +3388,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3531,10 +3415,9 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_15_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3564,10 +3447,9 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_15_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3672,8 +3554,7 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3703,10 +3584,9 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_2_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3753,8 +3633,7 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3784,10 +3663,9 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3834,8 +3712,7 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3865,10 +3742,9 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3915,8 +3791,7 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3946,10 +3821,9 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3996,8 +3870,7 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4027,10 +3900,9 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4077,8 +3949,7 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4108,10 +3979,9 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4158,8 +4028,7 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4189,10 +4058,9 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4239,8 +4107,7 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4270,10 +4137,9 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4320,8 +4186,7 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4351,10 +4216,9 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4401,8 +4265,7 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4432,10 +4295,9 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4482,8 +4344,7 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4513,10 +4374,9 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4563,8 +4423,7 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4594,10 +4453,9 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4644,8 +4502,7 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4675,10 +4532,9 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4725,8 +4581,7 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4756,10 +4611,9 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4806,8 +4660,7 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4837,10 +4690,9 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4887,8 +4739,7 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4918,10 +4769,9 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4968,8 +4818,7 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4999,10 +4848,9 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5049,8 +4897,7 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5080,10 +4927,9 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5130,8 +4976,7 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5161,10 +5006,9 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5211,8 +5055,7 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5242,10 +5085,9 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5292,8 +5134,7 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5323,10 +5164,9 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5373,8 +5213,7 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5404,10 +5243,9 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5454,8 +5292,7 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5485,10 +5322,9 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5535,8 +5371,7 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, 
%ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5566,10 +5401,9 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5616,8 +5450,7 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5647,10 +5480,9 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5697,8 +5529,7 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5728,10 +5559,9 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5778,8 +5608,7 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5809,10 +5638,9 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5859,8 +5687,7 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5890,10 +5717,9 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5940,8 +5766,7 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5971,10 +5796,9 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb 
%zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6021,8 +5845,7 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6052,10 +5875,9 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6102,8 +5924,7 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6133,10 +5954,9 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6183,8 +6003,7 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6214,10 +6033,9 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6264,8 +6082,7 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6295,10 +6112,9 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq 
%zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6345,8 +6161,7 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6376,10 +6191,9 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6426,8 +6240,7 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6457,10 +6270,9 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6507,8 +6319,7 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6538,10 +6349,9 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6588,8 +6398,7 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6619,10 +6428,9 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6669,8 +6477,7 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6700,10 +6507,9 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6750,8 +6556,7 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, 
%ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6781,10 +6586,9 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6831,8 +6635,7 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6862,10 +6665,9 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6912,8 +6714,7 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6943,10 +6744,9 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6993,8 +6793,7 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7024,10 +6823,9 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7074,8 +6872,7 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7105,10 +6902,9 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7155,8 +6951,7 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7186,10 +6981,9 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb 
%zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7236,8 +7030,7 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7267,10 +7060,9 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7317,8 +7109,7 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7348,10 +7139,9 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7398,8 +7188,7 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7429,10 +7218,9 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7479,8 +7267,7 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7510,10 +7297,9 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq 
%zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7560,8 +7346,7 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7591,10 +7376,9 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7641,8 +7425,7 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7672,10 +7455,9 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7722,8 +7504,7 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7753,10 +7534,9 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7803,8 +7583,7 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7834,10 +7613,9 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7884,8 +7662,7 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7915,10 +7692,9 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7965,8 +7741,7 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, 
%ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7996,10 +7771,9 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8046,8 +7820,7 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8077,10 +7850,9 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8127,8 +7899,7 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8158,10 +7929,9 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8208,8 +7978,7 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8239,10 +8008,9 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8289,8 +8057,7 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8320,10 +8087,9 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_31_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8444,8 +8210,7 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8467,10 +8232,9 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_2_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, 
%zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8509,8 +8273,7 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8532,10 +8295,9 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8574,8 +8336,7 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8597,10 +8358,9 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8639,8 +8399,7 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8662,10 +8421,9 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8704,8 +8462,7 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8727,10 +8484,9 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8769,8 +8525,7 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8792,10 +8547,9 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8834,8 +8588,7 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8857,10 +8610,9 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8899,8 +8651,7 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8922,10 +8673,9 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8964,8 +8714,7 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8987,10 +8736,9 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9029,8 +8777,7 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9052,10 +8799,9 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9094,8 +8840,7 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9117,10 +8862,9 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9159,8 +8903,7 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9182,10 +8925,9 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9224,8 +8966,7 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = 
mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9247,10 +8988,9 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9289,8 +9029,7 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9312,10 +9051,9 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9354,8 +9092,7 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9377,10 +9114,9 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9419,8 +9155,7 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9442,10 +9177,9 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, 
%zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9484,8 +9218,7 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9507,10 +9240,9 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9549,8 +9281,7 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9572,10 +9303,9 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 
= [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9614,8 +9344,7 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9637,10 +9366,9 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9679,8 +9407,7 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9702,10 +9429,9 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9744,8 +9470,7 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9767,10 +9492,9 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9809,8 +9533,7 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9832,10 +9555,9 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9874,8 +9596,7 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9897,10 +9618,9 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9939,8 +9659,7 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9962,10 +9681,9 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10004,8 +9722,7 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10027,10 +9744,9 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10069,8 +9785,7 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10092,10 +9807,9 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10134,8 +9848,7 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10157,10 +9870,9 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10199,8 +9911,7 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10222,10 +9933,9 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10264,8 +9974,7 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10287,10 +9996,9 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10329,8 +10037,7 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10352,10 +10059,9 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10394,8 +10100,7 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10417,10 +10122,9 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10459,8 +10163,7 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10482,10 +10185,9 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10524,8 +10226,7 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10547,10 +10248,9 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10589,8 +10289,7 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10612,10 +10311,9 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10654,8 +10352,7 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10677,10 +10374,9 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10719,8 +10415,7 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10742,10 +10437,9 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10784,8 +10478,7 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10807,10 +10500,9 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10849,8 +10541,7 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10872,10 +10563,9 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10914,8 +10604,7 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10937,10 +10626,9 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10979,8 +10667,7 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11002,10 +10689,9 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11044,8 +10730,7 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11067,10 +10752,9 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11109,8 +10793,7 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11132,10 +10815,9 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11174,8 +10856,7 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11197,10 +10878,9 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11239,8 +10919,7 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11262,10 +10941,9 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11304,8 +10982,7 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11327,10 +11004,9 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11369,8 +11045,7 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11392,10 +11067,9 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11434,8 +11108,7 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11457,10 +11130,9 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11499,8 +11171,7 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11522,10 +11193,9 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11564,8 +11234,7 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11587,10 +11256,9 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11629,8 +11297,7 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11652,10 +11319,9 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11694,8 +11360,7 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11717,10 +11382,9 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11759,8 +11423,7 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11782,10 +11445,9 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11824,8 +11486,7 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11847,10 +11508,9 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11889,8 +11549,7 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11912,10 +11571,9 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11954,8 +11612,7 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11977,10 +11634,9 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12019,8 +11675,7 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12042,10 +11697,9 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_30_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12084,8 +11738,7 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12107,10 +11760,9 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_30_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12149,8 +11801,7 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12172,10 +11823,9 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_31_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12214,8 +11864,7 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12237,10 +11886,9 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_31_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12279,8 +11927,7 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12302,10 +11949,9 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_32_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12344,8 +11990,7 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12367,10 +12012,9 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_32_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12409,8 +12053,7 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12432,10 +12075,9 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_33_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12474,8 +12116,7 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12497,10 +12138,9 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_33_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12539,8 +12179,7 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12562,10 +12201,9 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_34_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12604,8 +12242,7 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12627,10 +12264,9 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_34_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12669,8 +12305,7 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12692,10 +12327,9 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_35_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12734,8 +12368,7 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12757,10 +12390,9 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_35_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12799,8 +12431,7 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12822,10 +12453,9 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_36_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12864,8 +12494,7 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12887,10 +12516,9 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_36_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12929,8 +12557,7 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12952,10 +12579,9 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_37_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12994,8 +12620,7 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13017,10 +12642,9 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_37_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13059,8 +12683,7 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13082,10 +12705,9 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_38_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13124,8 +12746,7 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13147,10 +12768,9 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_38_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13189,8 +12809,7 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13212,10 +12831,9 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_39_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13254,8 +12872,7 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13277,10 +12894,9 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_39_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13319,8 +12935,7 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13342,10 +12957,9 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_40_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13384,8 +12998,7 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13407,10 +13020,9 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_40_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13449,8 +13061,7 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13472,10 +13083,9 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_41_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13514,8 +13124,7 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13537,10 +13146,9 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_41_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13579,8 +13187,7 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13602,10 +13209,9 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_42_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13644,8 +13250,7 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13667,10 +13272,9 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_42_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13709,8 +13313,7 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13732,10 +13335,9 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_43_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13774,8 +13376,7 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13797,10 +13398,9 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_43_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13839,8 +13439,7 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13862,10 +13461,9 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_44_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13904,8 +13502,7 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13927,10 +13524,9 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_44_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13969,8 +13565,7 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13992,10 +13587,9 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_45_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14034,8 +13628,7 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14057,10 +13650,9 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_45_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14099,8 +13691,7 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14122,10 +13713,9 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_46_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14164,8 +13754,7 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14187,10 +13776,9 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_46_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14229,8 +13817,7 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14252,10 +13839,9 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_47_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14294,8 +13880,7 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14317,10 +13902,9 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_47_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14359,8 +13943,7 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14382,10 +13965,9 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_48_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14424,8 +14006,7 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14447,10 +14028,9 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_48_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14489,8 +14069,7 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14512,10 +14091,9 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_49_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14554,8 +14132,7 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14577,10 +14154,9 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_49_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14619,8 +14195,7 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14642,10 +14217,9 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_50_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14684,8 +14258,7 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14707,10 +14280,9 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_50_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14749,8 +14321,7 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14772,10 +14343,9 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_51_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14814,8 +14384,7 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14837,10 +14406,9 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_51_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14879,8 +14447,7 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14902,10 +14469,9 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_52_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14944,8 +14510,7 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14967,10 +14532,9 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_52_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15009,8 +14573,7 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15032,10 +14595,9 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_53_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15074,8 +14636,7 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15097,10 +14658,9 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_53_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15139,8 +14699,7 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15162,10 +14721,9 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15204,8 +14762,7 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15227,10 +14784,9 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15269,8 +14825,7 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15292,10 +14847,9 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15334,8 +14888,7 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15357,10 +14910,9 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15399,8 +14951,7 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15422,10 +14973,9 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15464,8 +15014,7 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15487,10 +15036,9 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15529,8 +15077,7 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15552,10 +15099,9 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15594,8 +15140,7 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15617,10 +15162,9 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15659,8 +15203,7 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15682,10 +15225,9 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15724,8 +15266,7 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15747,10 +15288,9 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15789,8 +15329,7 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15812,10 +15351,9 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15854,8 +15392,7 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15877,10 +15414,9 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15919,8 +15455,7 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15942,10 +15477,9 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15984,8 +15518,7 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16007,10 +15540,9 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -16049,8 +15581,7 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16072,10 +15603,9 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -16114,8 +15644,7 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16137,10 +15666,9 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -16179,8 +15707,7 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16202,10 +15729,9 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -16244,8 +15770,7 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16267,10 +15792,9 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -16309,8 +15833,7 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -16332,10 +15855,9 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_63_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll index f470a2be8aee8..a7aaf03a2408c 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -11,8 +11,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -32,10 +31,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; ; AVX512BW-LABEL: testv8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -66,8 +64,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -95,10 +92,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -136,8 +132,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -161,10 +156,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -189,10 +183,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -217,8 +210,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -235,10 +227,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -251,8 +242,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -269,10 +259,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 4898ae98faea2..e924fbd258f89 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -232,7 +232,7 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) { ; ; SSE41-LABEL: test_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm8 = [1,1] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [1,1] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: paddq %xmm5, %xmm1 @@ -255,7 +255,7 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) { ; ; AVX1-LABEL: test_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -560,7 +560,7 @@ define 
i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; SSE41-LABEL: test_v16i32_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: paddd %xmm2, %xmm0 @@ -577,7 +577,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; AVX1-SLOW-LABEL: test_v16i32_v16i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -595,7 +595,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v16i32_v16i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -669,7 +669,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; SSE41-LABEL: test_v32i32_v32i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: paddd %xmm5, %xmm1 @@ -694,7 +694,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; AVX1-SLOW-LABEL: test_v32i32_v32i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -720,7 +720,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v32i32_v32i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -993,7 +993,7 @@ define i16 @test_v16i16_v16i8(<16 x i16> %a0) { ; ; SSE41-LABEL: test_v16i16_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -1191,7 +1191,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) { ; ; SSE41-LABEL: test_v64i16_v64i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -1218,7 +1218,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) { ; ; AVX1-LABEL: test_v64i16_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, 
%ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index f434fc8c6cad8..e197e28383b97 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -519,8 +519,8 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: psllw $7, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %eax @@ -569,35 +569,15 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_v32i16_v32i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v32i16_v32i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v32i16_v32i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512VL-NEXT: kortestw %k0, %k0 -; AVX512VL-NEXT: sete %al -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: trunc_v32i16_v32i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -658,35 +638,15 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_v64i8_v64i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 -; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v64i8_v64i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v64i8_v64i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512VL-NEXT: kortestw %k0, %k0 -; AVX512VL-NEXT: sete %al -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: trunc_v64i8_v64i1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -923,8 +883,8 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -991,8 +951,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1059,8 +1019,8 @@ define i1 @icmp0_v32i16_v32i1(<32 x i16>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 @@ -1127,8 +1087,8 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 @@ -1567,8 +1527,8 @@ define i1 @icmp1_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1640,8 +1600,8 @@ define i1 @icmp1_v16i32_v16i1(<16 x i32>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -1713,8 +1673,8 @@ define i1 @icmp1_v32i16_v32i1(<32 x i16>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; X86-SSE2-NEXT: 
pcmpeqb %xmm1, %xmm0 @@ -1786,8 +1746,8 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll index 5317f7ccc588b..95c70e1af3279 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-scalar.ll @@ -185,14 +185,14 @@ define i1 @test_v16i64(ptr %ptr) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: pand 112(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 +; SSE2-NEXT: pand 112(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 ; SSE2-NEXT: pand 80(%rdi), %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand 96(%rdi), %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand 96(%rdi), %xmm3 ; SSE2-NEXT: pand 64(%rdi), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 @@ -209,9 +209,9 @@ define i1 @test_v16i64(ptr %ptr) nounwind { ; SSE41-NEXT: movdqa 48(%rdi), %xmm3 ; SSE41-NEXT: pand 112(%rdi), %xmm3 ; SSE41-NEXT: pand 80(%rdi), %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand 96(%rdi), %xmm2 ; SSE41-NEXT: pand 64(%rdi), %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll index df6e079785568..455173a1314b5 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -166,8 +166,8 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand 56(%ebp), %xmm2 ; X86-SSE-NEXT: pand 24(%ebp), %xmm0 -; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 72(%ebp), %xmm3 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 40(%ebp), %xmm1 ; X86-SSE-NEXT: pand %xmm3, %xmm1 ; X86-SSE-NEXT: pand %xmm0, %xmm1 @@ -337,8 +337,8 @@ define i32 @test_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE-NEXT: pand %xmm1, %xmm0 @@ -411,8 +411,8 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand 56(%ebp), %xmm2 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand 24(%ebp), %xmm0 ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 72(%ebp), %xmm3 @@ -735,9 +735,9 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand 56(%ebp), %xmm2 ; X86-SSE-NEXT: pand 24(%ebp), %xmm0 -; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 72(%ebp), %xmm3 ; X86-SSE-NEXT: pand 40(%ebp), %xmm1 +; X86-SSE-NEXT: 
pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand %xmm3, %xmm1 ; X86-SSE-NEXT: pand %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -1127,9 +1127,9 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand 56(%ebp), %xmm2 ; X86-SSE-NEXT: pand 24(%ebp), %xmm0 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand 72(%ebp), %xmm3 ; X86-SSE-NEXT: pand 40(%ebp), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll index 29366f74da12a..4855c00610a5d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -31,7 +31,7 @@ define i64 @reduce_ctpop_v2i64(<2 x i64> %a0) { ; ; AVX2-LABEL: reduce_ctpop_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -48,7 +48,7 @@ define i64 @reduce_ctpop_v2i64(<2 x i64> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -103,7 +103,7 @@ define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) { ; ; AVX2-LABEL: reduce_ctpop_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -126,7 +126,7 @@ define i32 @reduce_ctpop_v4i32(<4 x i32> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -184,7 +184,7 @@ define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) { ; ; AVX2-LABEL: reduce_ctpop_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -204,7 +204,7 @@ define i16 @reduce_ctpop_v8i16(<8 x i16> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -261,7 +261,7 @@ define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) { ; ; AVX2-LABEL: reduce_ctpop_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -279,7 +279,7 @@ define i8 @reduce_ctpop_v16i8(<16 x i8> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -347,8 +347,7 @@ define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -366,10 +365,9 @@ define i64 @reduce_ctpop_v4i64(<4 x i64> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -443,8 +441,7 @@ define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -468,10 +465,9 @@ define i32 @reduce_ctpop_v8i32(<8 x i32> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -558,8 +554,7 @@ define i64 @reduce_ctpop_v8i64(<8 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -584,10 +579,9 @@ define i64 @reduce_ctpop_v8i64(<8 x i64> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VL-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -691,8 +685,7 @@ define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -728,10 +721,9 @@ define i32 @reduce_ctpop_v16i32(<16 x i32> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpandq %zmm1, %zmm0, 
%zmm2 -; AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VL-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -862,8 +854,7 @@ define i64 @reduce_ctpop_v16i64(<16 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 @@ -902,10 +893,9 @@ define i64 @reduce_ctpop_v16i64(<16 x i64> %a0) { ; ; AVX512VL-LABEL: reduce_ctpop_v16i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %zmm3, %zmm4, %zmm3 ; AVX512VL-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 @@ -1054,8 +1044,7 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64> ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 @@ -1097,10 +1086,9 @@ define <4 x i64> @reduce_ctpop_v4i64_buildvector_v4i64(<4 x i64> %a0, <4 x i64> ; ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm4, 
%ymm0, %ymm5 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 @@ -1372,8 +1360,7 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm0, %ymm8, %ymm10 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm10 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm0, %ymm8, %ymm0 @@ -1479,10 +1466,9 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm0, %ymm8, %ymm9 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm9, %ymm10, %ymm9 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm0, %ymm8, %ymm0 @@ -1552,13 +1538,13 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsadbw %zmm5, %zmm4, %zmm4 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] ; AVX512VL-NEXT: vpermd %zmm4, %zmm6, %zmm4 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vpsadbw %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,4] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,12,0,4] ; AVX512VL-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512VL-NEXT: retq @@ -1586,13 +1572,13 @@ define <8 x i32> @reduce_ctpop_v4i64_buildvector_v8i32(<4 x i64> %a0, <4 x i64> ; AVX512VPOPCNT-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512VPOPCNT-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512VPOPCNT-NEXT: vpsadbw %zmm5, %zmm4, %zmm4 -; AVX512VPOPCNT-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] +; AVX512VPOPCNT-NEXT: vmovdqa {{.*#+}} ymm6 = [8,12,8,12,8,12,0,4] ; AVX512VPOPCNT-NEXT: vpermd %zmm4, %zmm6, %zmm4 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 ; AVX512VPOPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VPOPCNT-NEXT: vpsadbw %zmm5, %zmm0, %zmm0 -; AVX512VPOPCNT-NEXT: vpmovsxbd 
{{.*#+}} xmm1 = [8,12,0,4] +; AVX512VPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [8,12,0,4] ; AVX512VPOPCNT-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512VPOPCNT-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index 7048b98227620..19b5e426c7e12 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -162,7 +162,6 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -175,6 +174,7 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE41-NEXT: cmpunordss %xmm1, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE41-NEXT: movaps %xmm3, %xmm1 ; SSE41-NEXT: maxss %xmm2, %xmm1 ; SSE41-NEXT: cmpunordss %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index 008e3e4c217cb..143e971591256 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -24,8 +24,8 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB1_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm2, %xmm3 @@ -107,8 +107,8 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB2_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movaps %xmm3, %xmm4 @@ -127,8 +127,8 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-NEXT: andnps %xmm4, %xmm2 ; SSE2-NEXT: orps %xmm5, %xmm2 ; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB2_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movaps %xmm1, %xmm4 @@ -146,8 +146,8 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE2-NEXT: andnps %xmm4, %xmm3 ; SSE2-NEXT: orps %xmm5, %xmm3 ; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB2_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movaps %xmm0, %xmm2 @@ -329,8 +329,8 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB3_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -348,8 +348,8 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE2-NEXT: orps %xmm4, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB3_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -367,8 +367,8 @@ define float @test_v8f32(<8 x float> 
%a0) { ; SSE2-NEXT: orps %xmm5, %xmm2 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB3_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm1, %xmm3 @@ -650,8 +650,8 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB4_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -669,8 +669,8 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: orps %xmm4, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB4_6 ; SSE2-NEXT: # %bb.5: ; SSE2-NEXT: movdqa %xmm3, %xmm4 @@ -688,8 +688,8 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE2-NEXT: orps %xmm5, %xmm2 ; SSE2-NEXT: movd %xmm2, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: js .LBB4_10 ; SSE2-NEXT: # %bb.9: ; SSE2-NEXT: movdqa %xmm1, %xmm3 @@ -964,8 +964,8 @@ define double @test_v2f64(<2 x double> %a0) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB5_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -1066,8 +1066,8 @@ define double @test_v4f64(<4 x double> %a0) { ; SSE2-NEXT: orpd %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB6_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -1250,8 +1250,8 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE2-NEXT: orpd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB7_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -1547,8 +1547,8 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE2-NEXT: orpd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB8_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: movdqa %xmm2, %xmm3 @@ -1568,71 +1568,71 @@ define double @test_v16f64(<16 x double> %a0) { ; ; SSE41-LABEL: test_v16f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm10 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: maxpd %xmm10, %xmm9 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: movapd %xmm8, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: maxpd %xmm9, %xmm8 ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm7, 
%xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movapd %xmm8, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm3 -; SSE41-NEXT: maxpd %xmm7, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: maxpd %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm5, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm3 -; SSE41-NEXT: maxpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm9, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm2, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movapd %xmm8, %xmm5 +; SSE41-NEXT: maxpd %xmm7, %xmm5 +; SSE41-NEXT: movapd %xmm8, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 +; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm2 -; SSE41-NEXT: maxpd %xmm5, %xmm2 +; SSE41-NEXT: maxpd %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm6, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm1 -; SSE41-NEXT: maxpd %xmm5, %xmm1 +; SSE41-NEXT: maxpd %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 +; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: maxpd %xmm4, %xmm1 +; SSE41-NEXT: maxpd %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll 
b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index 727af12217c67..c19dbc610bb7e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -97,7 +97,6 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] @@ -110,6 +109,7 @@ define float @test_v4f32(<4 x float> %a0) { ; SSE41-NEXT: cmpunordss %xmm1, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE41-NEXT: movaps %xmm3, %xmm1 ; SSE41-NEXT: minss %xmm2, %xmm1 ; SSE41-NEXT: cmpunordss %xmm2, %xmm2 @@ -627,8 +627,8 @@ define double @test_v2f64(<2 x double> %a0) { define double @test_v3f64(<3 x double> %a0) { ; SSE2-LABEL: test_v3f64: ; SSE2: # %bb.0: -; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movapd %xmm2, %xmm1 ; SSE2-NEXT: minpd %xmm0, %xmm1 ; SSE2-NEXT: cmpunordpd %xmm0, %xmm0 @@ -648,8 +648,8 @@ define double @test_v3f64(<3 x double> %a0) { ; ; SSE41-LABEL: test_v3f64: ; SSE41: # %bb.0: -; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: movapd %xmm2, %xmm1 ; SSE41-NEXT: minpd %xmm0, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index ab95081e2938e..616e2e661f720 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -2326,8 +2326,8 @@ define i8 @illegal_v8i8(i8 %a0, ptr %a1) { ; ; AVX-LABEL: illegal_v8i8: ; AVX: # %bb.0: -; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: movl %edi, %eax ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index f80544fdef7e6..2eb8f91eee604 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -2302,9 +2302,9 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqw 56(%ebp), %xmm2 ; X86-SSE2-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqw 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqw 40(%ebp), %xmm1 @@ -2402,9 +2402,9 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 56(%ebp), %xmm2 ; X86-SSE2-NEXT: pcmpeqb 24(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqb 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 40(%ebp), %xmm1 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 9cd0f4d12e15a..066998037c4bc 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -1027,21 +1027,21 @@ define zeroext i1 @PR44781(ptr %0) { ; ; AVX2-LABEL: PR44781: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [64424509455,64424509455] ; AVX2-NEXT: vptest (%rdi), %xmm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: retq ; ; AVX512F-LABEL: PR44781: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [64424509455,64424509455] ; AVX512F-NEXT: vptest (%rdi), %xmm0 ; AVX512F-NEXT: sete %al ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: PR44781: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [64424509455,64424509455] ; AVX512BW-NEXT: vptest (%rdi), %xmm0 ; AVX512BW-NEXT: sete %al ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll index 3c960f255046c..c553327987c4e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -166,8 +166,8 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por 56(%ebp), %xmm2 ; X86-SSE-NEXT: por 24(%ebp), %xmm0 -; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 72(%ebp), %xmm3 +; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 40(%ebp), %xmm1 ; X86-SSE-NEXT: por %xmm3, %xmm1 ; X86-SSE-NEXT: por %xmm0, %xmm1 @@ -337,8 +337,8 @@ define i32 @test_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 8(%ebp), %xmm1 +; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE-NEXT: por %xmm1, %xmm0 @@ -411,8 +411,8 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por 56(%ebp), %xmm2 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por 24(%ebp), %xmm0 ; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 72(%ebp), %xmm3 @@ -735,9 +735,9 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por 56(%ebp), %xmm2 ; X86-SSE-NEXT: por 24(%ebp), %xmm0 -; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 72(%ebp), %xmm3 ; X86-SSE-NEXT: por 40(%ebp), %xmm1 +; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por %xmm3, %xmm1 ; X86-SSE-NEXT: por %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -1127,9 +1127,9 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por 56(%ebp), %xmm2 ; X86-SSE-NEXT: por 24(%ebp), %xmm0 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: por %xmm2, %xmm0 ; X86-SSE-NEXT: por 72(%ebp), %xmm3 ; X86-SSE-NEXT: por 40(%ebp), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll index 322fdde106dcf..80b6b7e27e892 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ 
b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -37,7 +37,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm0 @@ -127,7 +127,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-LABEL: test_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm4 @@ -283,7 +283,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-LABEL: test_v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm6 @@ -538,7 +538,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm10 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll index bb87740c21538..7ead5a0eda6d7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -37,7 +37,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 @@ -127,7 +127,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE41-LABEL: test_v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 @@ -283,7 +283,7 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE41-LABEL: test_v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm6 @@ -538,7 +538,7 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm10 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index b355c3dee5309..409c859259457 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -78,7 +78,7 @@ define i64 @test_v2i64(<2 x i64> 
%a0) { ; AVX2-LABEL: test_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 2d68cf9d6374d..0a7772fc06c5a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -78,7 +78,7 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; AVX2-LABEL: test_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 320dce840ea57..f3654cd48f889 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -342,29 +342,17 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind { } define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { -; SSE2-LABEL: trunc_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorb %ah, %al -; SSE2-NEXT: setnp %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %eax -; SSE41-NEXT: xorb %ah, %al -; SSE41-NEXT: setnp %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: xorb %ah, %al +; SSE-NEXT: setnp %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v16i16_v16i1: ; AVX1: # %bb.0: @@ -638,9 +626,9 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; X86-SSE2-NEXT: pand %xmm3, %xmm1 ; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE2-NEXT: psllw $7, %xmm0 @@ -669,7 +657,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -685,7 +673,7 @@ define i1 @trunc_v16i32_v16i1(<16 x 
i32>) nounwind { ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -766,9 +754,9 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) nounwind { ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X86-SSE2-NEXT: pand %xmm3, %xmm1 ; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: psllw $7, %xmm2 @@ -779,37 +767,21 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) nounwind { ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: trunc_v32i16_v32i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; X64-SSE2-NEXT: pand %xmm4, %xmm3 -; X64-SSE2-NEXT: pand %xmm4, %xmm2 -; X64-SSE2-NEXT: packuswb %xmm3, %xmm2 -; X64-SSE2-NEXT: pand %xmm4, %xmm1 -; X64-SSE2-NEXT: pand %xmm4, %xmm0 -; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: psllw $7, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: xorb %ah, %al -; X64-SSE2-NEXT: setnp %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %eax -; SSE41-NEXT: xorb %ah, %al -; SSE41-NEXT: setnp %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: trunc_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: pand %xmm4, %xmm2 +; X64-SSE-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE-NEXT: pand %xmm4, %xmm1 +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE-NEXT: pxor %xmm2, %xmm0 +; X64-SSE-NEXT: psllw $7, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: xorb %ah, %al +; X64-SSE-NEXT: setnp %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: @@ -2884,9 +2856,9 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqw 56(%ebp), %xmm2 ; X86-SSE2-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqw 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqw 40(%ebp), %xmm1 @@ -3004,9 +2976,9 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 56(%ebp), %xmm2 ; X86-SSE2-NEXT: pcmpeqb 24(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqb 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 40(%ebp), %xmm1 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll index 33199388fd6fc..37e3dad500a4f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -166,8 +166,8 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 56(%ebp), %xmm2 ; X86-SSE-NEXT: pxor 24(%ebp), %xmm0 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 72(%ebp), %xmm3 +; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 40(%ebp), %xmm1 ; X86-SSE-NEXT: pxor %xmm3, %xmm1 ; X86-SSE-NEXT: pxor %xmm0, %xmm1 @@ -337,8 +337,8 @@ define i32 @test_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 8(%ebp), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; X86-SSE-NEXT: pxor %xmm1, %xmm0 @@ -411,8 +411,8 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 56(%ebp), %xmm2 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 24(%ebp), %xmm0 ; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 72(%ebp), %xmm3 @@ -735,9 +735,9 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 56(%ebp), %xmm2 ; X86-SSE-NEXT: pxor 24(%ebp), %xmm0 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 72(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 40(%ebp), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor %xmm3, %xmm1 ; X86-SSE-NEXT: pxor %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] @@ -1127,9 +1127,9 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE-NEXT: movl %esp, %ebp ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp -; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 56(%ebp), %xmm2 ; X86-SSE-NEXT: pxor 24(%ebp), %xmm0 +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pxor %xmm2, %xmm0 ; X86-SSE-NEXT: pxor 72(%ebp), %xmm3 ; X86-SSE-NEXT: pxor 40(%ebp), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a768baae97add..5cbb552df765f 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -139,7 +139,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 @@ -155,7 +155,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m 
%zmm1, %k1 ; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 @@ -194,7 +194,7 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 @@ -220,7 +220,7 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -276,7 +276,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -324,7 +324,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -445,7 +445,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] +; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: movb $63, %al @@ -462,7 +462,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movb $63, %al @@ -480,7 +480,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,0,0,1,1,1,u,u] ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: movb $63, %al @@ -506,7 +506,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -522,7 +522,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -538,7 +538,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ -563,7 +563,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 @@ -573,7 +573,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -587,7 +587,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 @@ -596,7 +596,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; 
AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -610,7 +610,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -637,7 +637,7 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -645,10 +645,10 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -664,7 +664,7 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -672,10 +672,10 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -691,15 +691,15 @@ define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; 
AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -722,7 +722,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 @@ -730,10 +730,10 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -763,7 +763,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k1 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 @@ -771,10 +771,10 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; 
AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -1272,7 +1272,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 @@ -1286,9 +1286,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9 @@ -1341,7 +1341,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -1355,9 +1355,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9 @@ -2314,7 +2314,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2338,7 +2338,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m 
%ymm0, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2364,7 +2364,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2390,7 +2390,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512VBMI-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} @@ -2411,7 +2411,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2423,7 +2423,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2435,7 +2435,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -2455,10 +2455,10 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -2472,10 +2472,10 @@ define void 
@mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -2511,16 +2511,16 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -2538,16 +2538,16 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -2594,17 +2594,17 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 @@ -2641,17 +2641,17 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -2768,13 +2768,13 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; 
AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11 @@ -2849,13 +2849,13 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11 @@ -3057,7 +3057,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3074,7 +3074,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3091,7 +3091,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3122,7 +3122,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: movw $15, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3141,7 +3141,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; 
AVX512DQ-NEXT: movw $15, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3155,7 +3155,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF @@ -3182,7 +3182,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 @@ -3190,12 +3190,12 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw %eax, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -3211,7 +3211,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 @@ -3219,11 +3219,11 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m 
%ymm0, %k3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -3290,7 +3290,7 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -3298,16 +3298,16 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -3327,7 +3327,7 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -3335,16 +3335,16 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -3364,23 +3364,23 @@ define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -3404,7 +3404,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 @@ -3414,13 +3414,13 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; 
AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -3463,7 +3463,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -3473,13 +3473,13 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -4243,7 +4243,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -4257,13 +4257,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; 
AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 @@ -4346,7 +4346,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -4360,13 +4360,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 @@ -5793,7 +5793,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5809,7 +5809,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5825,7 +5825,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5856,7 +5856,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW-NEXT: movw $255, %ax ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; 
AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5870,13 +5870,13 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: movw $255, %ax ; AVX512F-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5896,7 +5896,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW-NEXT: movw $255, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5910,13 +5910,13 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: movw $255, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -5930,7 +5930,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -5957,7 +5957,7 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -5965,10 +5965,10 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -5984,7 +5984,7 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -5992,10 +5992,10 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -6011,15 +6011,15 @@ define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -6041,7 +6041,7 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} 
zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -6049,19 +6049,19 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -6083,7 +6083,7 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -6091,19 +6091,19 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 
= [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -6125,27 +6125,27 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -6170,7 +6170,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 @@ -6180,15 +6180,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; 
AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -6238,7 +6238,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -6248,15 +6248,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -7158,7 +7158,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -7172,15 +7172,15 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 
2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm9 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 @@ -7278,7 +7278,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -7292,15 +7292,15 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 @@ -9001,7 +9001,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] +; 
AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9019,7 +9019,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9037,7 +9037,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9064,13 +9064,13 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9085,13 +9085,13 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9106,7 +9106,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl 
$268435455, %eax # imm = 0xFFFFFFF @@ -9134,7 +9134,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 ; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 @@ -9142,10 +9142,10 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-SLOW-NEXT: kmovw %eax, %k2 ; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -9168,7 +9168,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 @@ -9176,15 +9176,15 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-FAST-NEXT: kmovw %eax, %k2 ; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9202,7 +9202,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, 
%k1 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1 @@ -9210,10 +9210,10 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 @@ -9235,7 +9235,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1 @@ -9243,14 +9243,14 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -9325,7 +9325,7 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -9333,22 +9333,22 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; 
AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9372,7 +9372,7 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 @@ -9380,22 +9380,22 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 ; 
AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -9419,31 +9419,31 @@ define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -9469,7 +9469,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 @@ -9479,17 +9479,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: 
vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -9546,7 +9546,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 @@ -9556,17 +9556,17 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 @@ -10604,7 +10604,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 
{{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -10618,17 +10618,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm11 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 @@ -10741,7 +10741,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -10755,17 +10755,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd 
{{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 @@ -12716,7 +12716,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12728,7 +12728,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12740,7 +12740,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12760,10 +12760,10 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ -12777,10 +12777,10 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} @@ 
-12794,7 +12794,7 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} @@ -12817,16 +12817,16 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -12844,16 +12844,16 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} @@ -12899,29 +12899,29 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; 
AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -12948,29 +12948,29 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd 
%zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -13040,21 +13040,21 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm16 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 @@ -13121,21 +13121,21 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2 @@ -13207,8 +13207,8 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpmovb2m %zmm2, %k3 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpmovb2m %zmm2, %k3 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 ; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 @@ -13277,25 +13277,25 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm10 {%k1} {z} = -1 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 ; AVX512F-ONLY-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 @@ -13436,25 +13436,25 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 993e6afc0eaf3..df6276d241fb4 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -22,58 +22,59 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE2-LABEL: var_rotate_v2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] -; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE2-NEXT: psllq %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psllq %xmm1, %xmm4 -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE2-NEXT: psubq %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE2-NEXT: psrlq %xmm2, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: orpd %xmm4, %xmm0 +; SSE2-NEXT: orpd %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_rotate_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] -; SSE41-NEXT: psubq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE41-NEXT: psllq %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64] ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psllq %xmm1, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: psubq %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psrlq %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: psrlq %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: var_rotate_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_rotate_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 @@ -101,20 +102,20 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X86-SSE2-LABEL: var_rotate_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0] -; X86-SSE2-NEXT: psubq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psllq %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psllq %xmm3, %xmm2 +; X86-SSE2-NEXT: 
movdqa {{.*#+}} xmm3 = [64,0,64,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psllq %xmm1, %xmm4 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; X86-SSE2-NEXT: psubq %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm4, %xmm0 +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %b64 = sub <2 x i64> , %b %shl = shl <2 x i64> %a, %b @@ -344,7 +345,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -355,7 +356,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -660,34 +661,34 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { -; SSE2-LABEL: splatvar_rotate_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] -; SSE2-NEXT: psubq %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psllq %xmm1, %xmm3 -; SSE2-NEXT: psrlq %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: splatvar_rotate_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,64] +; SSE-NEXT: psubq %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psllq %xmm1, %xmm3 +; SSE-NEXT: psrlq %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: retq ; -; SSE41-LABEL: splatvar_rotate_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] -; SSE41-NEXT: psubq %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllq %xmm1, %xmm3 -; SSE41-NEXT: psrlq %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: retq +; AVX1-LABEL: splatvar_rotate_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq ; -; AVX-LABEL: splatvar_rotate_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] -; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: splatvar_rotate_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; 
AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512NOVLX-LABEL: splatvar_rotate_v2i64: ; AVX512NOVLX: # %bb.0: @@ -805,16 +806,16 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: pslld %xmm1, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -827,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX-LABEL: splatvar_rotate_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -838,7 +839,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512F-LABEL: splatvar_rotate_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -849,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VL-LABEL: splatvar_rotate_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -860,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512BW-LABEL: splatvar_rotate_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -871,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VLBW-LABEL: splatvar_rotate_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -914,9 +915,9 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; X86-SSE2-NEXT: pslld %xmm1, %xmm2 -; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld %xmm1, %xmm0 +; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm0 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 ; X86-SSE2-NEXT: retl @@ -935,9 +936,9 @@ define <16 x i8> 
@splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE-NEXT: psllw %xmm1, %xmm2 -; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psllw %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm2 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -947,11 +948,11 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: splatvar_rotate_v16i8: @@ -959,11 +960,11 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512-NEXT: vpsllw %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_rotate_v16i8: @@ -985,9 +986,9 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; X86-SSE2-NEXT: psllw %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: psllw %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm2 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE2-NEXT: retl @@ -1053,7 +1054,7 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v2i64: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] +; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] ; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper @@ -1093,8 +1094,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ 
-1137,7 +1138,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v4i32: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] +; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] ; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper @@ -1157,8 +1158,8 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1173,27 +1174,18 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { } define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { -; SSE2-LABEL: constant_rotate_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmulhuw %xmm1, %xmm2 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_rotate_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: constant_rotate_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmulhuw %xmm1, %xmm2 +; SSE-NEXT: pmullw %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: constant_rotate_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1201,7 +1193,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; ; AVX512F-LABEL: constant_rotate_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1218,8 +1210,8 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_rotate_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 @@ -1236,7 +1228,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; AVX512VBMI2-LABEL: constant_rotate_v8i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw 
{{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1272,9 +1264,9 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,128,64,32,16,8,4,2] -; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1283,9 +1275,9 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1294,9 +1286,9 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1305,20 +1297,20 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_rotate_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ 
-1330,20 +1322,20 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_rotate_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -1355,9 +1347,9 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: retq @@ -1372,9 +1364,9 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,128,64,32,16,8,4,2] -; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] +; X86-SSE2-NEXT: psrlw $8, %xmm1 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c2c6a5f7eba57..7a86aa878912f 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -17,29 +17,30 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: var_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; 
AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 +; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-NEXT: vpsrlq %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -265,7 +266,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -513,14 +514,15 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -528,7 +530,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-LABEL: splatvar_rotate_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [64,64] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ 
-582,19 +584,19 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_rotate_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 @@ -643,19 +645,19 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_rotate_v16i16: @@ -780,11 +782,11 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_rotate_v32i8: @@ -792,11 +794,11 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, 
%ymm2, %ymm1 ; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_rotate_v32i8: @@ -860,7 +862,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v4i64: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] +; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] ; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq @@ -899,9 +901,9 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] @@ -920,7 +922,7 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX512NOVLX-LABEL: constant_rotate_v8i32: ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] ; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq @@ -959,7 +961,7 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -993,8 +995,8 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_rotate_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -1010,7 +1012,7 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; AVX512VBMI2-LABEL: constant_rotate_v16i16: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1046,11 +1048,11 @@ define <32 x i8> 
@constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 @@ -1068,9 +1070,9 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1079,9 +1081,9 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1090,22 +1092,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_rotate_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1116,22 +1116,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: constant_rotate_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] -; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] -; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1142,9 +1140,9 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: retq @@ -1370,7 +1368,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 @@ -1629,7 +1627,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 8ac0b178a16df..4acd60ca3bc46 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -67,7 +67,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512VL-LABEL: var_rotate_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -97,7 +97,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -107,7 +107,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -134,30 +134,30 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 -; AVX512F-NEXT: vpsllw 
$4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm5 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (zmm4 & (zmm5 ^ zmm3)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm5 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4)) +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm6 = zmm5 ^ (zmm7 & (zmm6 ^ zmm5)) ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 +; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm5 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 -; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3)) +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm5 = zmm3 ^ (zmm4 & (zmm5 ^ zmm3)) ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3)) @@ -190,7 +190,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8) ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 @@ -320,12 +320,12 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512F-NEXT: vpsrlw %xmm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; 
AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_rotate_v32i16: @@ -337,12 +337,12 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm6 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm6, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_rotate_v32i16: @@ -434,11 +434,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_rotate_v64i8: @@ -446,11 +446,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_rotate_v64i8: @@ -458,11 +458,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VBMI2-NEXT: 
vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8: @@ -470,11 +470,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %splat8 = sub <64 x i8> , %splat @@ -619,9 +619,9 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -630,9 +630,9 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -641,9 +641,9 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) 
nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq @@ -652,9 +652,9 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index f57efb40bf0e3..12f67c6244ed8 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2121,38 +2121,16 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(ptr%ptr) { } define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i1_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_8i1_to_8i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_8i1_to_8i16: ; AVX1: # %bb.0: # %entry @@ -2160,7 +2138,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -2168,7 +2146,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; AVX2-LABEL: load_sext_8i1_to_8i16: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2192,29 +2170,17 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; -; X86-SSE2-LABEL: load_sext_8i1_to_8i16: -; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: load_sext_8i1_to_8i16: -; X86-SSE41: # %bb.0: # %entry -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movzbl (%eax), %eax -; X86-SSE41-NEXT: movd %eax, %xmm0 -; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; X86-SSE41-NEXT: pand %xmm1, %xmm0 -; X86-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 -; X86-SSE41-NEXT: retl +; X86-SSE-LABEL: load_sext_8i1_to_8i16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl (%eax), %eax +; X86-SSE-NEXT: movd %eax, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; X86-SSE-NEXT: retl entry: %X = load <8 x i1>, ptr %ptr %Y = sext <8 x i1> %X to <8 x i16> @@ -2371,47 +2337,19 @@ entry: } define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { -; SSE2-LABEL: load_sext_8i1_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i1_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzbl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, 
%xmm0 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i1_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzbl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,2,4,8] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [16,32,64,128] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_8i1_to_8i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_8i1_to_8i32: ; AVX1: # %bb.0: # %entry @@ -2427,7 +2365,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { ; AVX2-LABEL: load_sext_8i1_to_8i32: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2448,35 +2386,20 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X86-SSE2-LABEL: load_sext_8i1_to_8i32: -; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: load_sext_8i1_to_8i32: -; X86-SSE41: # %bb.0: # %entry -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movzbl (%eax), %eax -; X86-SSE41-NEXT: movd %eax, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X86-SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,2,4,8] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pand %xmm2, %xmm0 -; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X86-SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [16,32,64,128] -; X86-SSE41-NEXT: pand %xmm2, %xmm1 -; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm1 -; X86-SSE41-NEXT: retl +; X86-SSE-LABEL: load_sext_8i1_to_8i32: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl (%eax), %eax +; X86-SSE-NEXT: movd %eax, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 +; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] +; X86-SSE-NEXT: pand %xmm2, %xmm1 +; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; X86-SSE-NEXT: retl entry: %X = load <8 x i1>, ptr %ptr %Y = sext <8 x i1> %X to <8 x i32> @@ -2649,50 +2572,20 @@ entry: } 
define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { -; SSE2-LABEL: load_sext_16i1_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_16i1_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pcmpeqw %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_16i1_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: load_sext_16i1_to_16i16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: load_sext_16i1_to_16i16: ; AVX1: # %bb.0: # %entry @@ -2731,37 +2624,21 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; -; X86-SSE2-LABEL: load_sext_16i1_to_16i16: -; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: movd %eax, %xmm0 -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpeqw %xmm2, %xmm1 -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: load_sext_16i1_to_16i16: -; X86-SSE41: # %bb.0: # %entry -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movzwl (%eax), %eax -; X86-SSE41-NEXT: movd %eax, %xmm0 -; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE41-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[0,1,0,1] -; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pand %xmm2, %xmm0 -; X86-SSE41-NEXT: pcmpeqw %xmm2, %xmm0 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; X86-SSE41-NEXT: pand %xmm2, %xmm1 -; X86-SSE41-NEXT: pcmpeqw %xmm2, %xmm1 -; X86-SSE41-NEXT: retl +; X86-SSE-LABEL: load_sext_16i1_to_16i16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzwl (%eax), %eax +; X86-SSE-NEXT: movd %eax, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 +; X86-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] +; X86-SSE-NEXT: pand %xmm2, %xmm1 +; X86-SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; X86-SSE-NEXT: retl entry: %X = load <16 x i1>, ptr %ptr %Y = sext <16 x i1> %X to <16 x i16> @@ -3520,9 +3397,9 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { ; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE-NEXT: pcmpeqw 40(%ebp), %xmm1 ; X86-SSE-NEXT: pcmpeqw 24(%ebp), %xmm0 -; X86-SSE-NEXT: packsswb %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqw 72(%ebp), %xmm3 ; X86-SSE-NEXT: pcmpeqw 56(%ebp), %xmm2 +; X86-SSE-NEXT: packsswb %xmm1, %xmm0 ; X86-SSE-NEXT: packsswb %xmm3, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE-NEXT: movl %ebp, %esp @@ -3735,9 +3612,9 @@ define <4 x i32> @sext_4i17_to_4i32(ptr %ptr) { ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE41-NEXT: movl (%edx), %eax ; X86-SSE41-NEXT: movl 4(%edx), %ecx -; X86-SSE41-NEXT: movl %ecx, %esi ; X86-SSE41-NEXT: movl 8(%edx), %edx ; X86-SSE41-NEXT: shldl $13, %ecx, %edx +; X86-SSE41-NEXT: movl %ecx, %esi ; X86-SSE41-NEXT: shldl $15, %eax, %ecx ; X86-SSE41-NEXT: shll $15, %ecx ; X86-SSE41-NEXT: sarl $15, %ecx diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 60295f1c145a1..395a62aef0b4c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -67,7 +67,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: var_shift_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -123,14 +123,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = 
xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -144,13 +144,13 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -160,14 +160,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i32: @@ -205,14 +205,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: psrad %xmm4, %xmm2 -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: psrad %xmm3, %xmm4 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE-NEXT: psrad %xmm4, %xmm5 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: psrad %xmm1, %xmm0 -; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X86-SSE-NEXT: movaps %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -264,10 +264,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: 
movdqa %xmm1, %xmm0 @@ -467,7 +467,6 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm4 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -481,6 +480,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -499,7 +499,6 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -510,8 +509,9 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 ; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: var_shift_v16i8: @@ -645,7 +645,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -800,15 +800,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v16i8: @@ -817,12 +817,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = 
[32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm3 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v16i8: @@ -831,10 +831,10 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -845,11 +845,11 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -914,15 +914,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm3 ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X86-SSE-NEXT: psrlw %xmm1, %xmm2 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm3, %xmm0 +; X86-SSE-NEXT: psubb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer %shift = ashr <16 x i8> %a, %splat @@ -958,7 +958,7 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi ; AVX2-LABEL: splatvar_modulo_shift_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -985,8 +985,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi ; ; AVX512-LABEL: splatvar_modulo_shift_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -1104,15 +1104,15 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_modulo_shift_v16i8: @@ -1121,12 +1121,12 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm3 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_modulo_shift_v16i8: @@ -1135,10 +1135,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1149,11 +1149,11 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1223,15 +1223,15 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; X86-SSE-NEXT: psrlw %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm3 ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X86-SSE-NEXT: psrlw %xmm1, %xmm2 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm3, %xmm0 +; X86-SSE-NEXT: psubb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %mod = and <16 x i8> %b, %splat = shufflevector <16 x i8> %mod, <16 x i8> poison, <16 x i32> zeroinitializer @@ -1292,7 +1292,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; AVX512-LABEL: constant_shift_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,7] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -1452,7 +1452,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1543,7 +1543,7 @@ define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,7,1,1,3,3,6,6] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,1,1,3,3,6,6] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1581,10 +1581,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: psraw $8, %xmm1 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256] -; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1594,10 +1594,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1627,7 +1627,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1657,10 +1657,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X86-SSE-NEXT: psraw $8, %xmm1 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,4,8,16,32,64,128,256] -; X86-SSE-NEXT: psrlw $8, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] +; X86-SSE-NEXT: psrlw $8, %xmm1 ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl @@ -1720,7 +1720,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,4,7,3,2,0,5,6] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [1,4,7,3,2,0,5,6] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2] @@ -1998,7 +1998,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2012,26 +2012,18 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512DQVL-LABEL: splatconstant_shift_v16i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; 
AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatconstant_shift_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BWVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq +; AVX512VL-LABEL: splatconstant_shift_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v16i8: ; X86-SSE: # %bb.0: @@ -2070,7 +2062,7 @@ define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) { ; AVX2-LABEL: PR52719: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 9f3fff34ea20c..54b9a5b002285 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -35,8 +35,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 @@ -105,8 +105,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 @@ -135,25 +135,25 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 ; AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-NEXT: vpsrad %xmm7, %xmm2, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpsrad %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -191,32 +191,32 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; X86-AVX1-LABEL: var_shift_v8i32: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; X86-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; X86-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 -; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; X86-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; X86-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vpsrad %xmm2, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpsrad %xmm2, %xmm3, %xmm6 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X86-AVX1-NEXT: vpsrad %xmm7, %xmm3, %xmm7 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; X86-AVX1-NEXT: vpsrad %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpblendw {{.*#+}} 
xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vpsrad %xmm5, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] +; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpsrad %xmm6, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v8i32: @@ -234,18 +234,18 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 -; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsraw $8, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 @@ -336,18 +336,18 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 ; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 -; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpsraw $8, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: 
vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsraw $4, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsraw $2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsraw $1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 ; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 @@ -451,7 +451,6 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -462,8 +461,9 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: var_shift_v32i8: @@ -503,7 +503,6 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -514,8 +513,9 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i8: @@ -539,7 +539,6 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQVL-NEXT: vpsraw $1, %ymm3, %ymm4 ; AVX512DQVL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -550,8 +549,9 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQVL-NEXT: vpsraw $1, %ymm0, %ymm3 ; AVX512DQVL-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: 
vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: var_shift_v32i8: @@ -630,7 +630,6 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 ; X86-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 -; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X86-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 @@ -641,8 +640,9 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 ; X86-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -888,11 +888,11 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -925,11 +925,11 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -946,7 +946,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, 
%ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -990,11 +990,11 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl @@ -1058,8 +1058,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi ; ; AVX512-LABEL: splatvar_modulo_shift_v4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -1254,11 +1254,11 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1293,11 +1293,11 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -1315,7 +1315,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpbroadcastd 
{{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -1360,11 +1360,11 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl @@ -1385,7 +1385,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2] ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 @@ -1424,7 +1424,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512-LABEL: constant_shift_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,7,31,62] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62] ; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -1440,7 +1440,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; X86-AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; X86-AVX1-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,0] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,0] ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 @@ -1583,7 +1583,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1672,7 +1672,7 @@ define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,1,1,3,3,2,2,6,6,7,7,5,5,4,4] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,3,3,2,2,6,6,7,7,5,5,4,4] ; 
AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1745,10 +1745,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpsraw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1776,10 +1776,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -1796,10 +1796,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpsraw $8, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpsraw $8, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq @@ -1842,10 +1842,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X86-AVX2-NEXT: vpsraw $8, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; X86-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X86-AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; X86-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpackuswb 
%ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl @@ -1912,7 +1912,7 @@ define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v32i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,6,5,4,2,6,2,0,7,2,4,6,3,5,4,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,4,2,6,2,0,7,2,4,6,3,5,4,1] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,2,4,4,8,8,32,32,2,2,32,32,128,128,1,1,32,32,8,8,2,2,16,16,4,4,8,8,64,64] @@ -2085,7 +2085,8 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: splatconstant_shift_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609] +; XOPAVX1-NEXT: # xmm2 = mem[0,0] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2291,21 +2292,13 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512DQVL-LABEL: splatconstant_shift_v32i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem) -; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatconstant_shift_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BWVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem) -; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: retq +; AVX512VL-LABEL: splatconstant_shift_v32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem) +; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; X86-AVX1-LABEL: splatconstant_shift_v32i8: ; X86-AVX1: # %bb.0: @@ -2360,7 +2353,8 @@ define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind { ; XOPAVX1-LABEL: shift32_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584] +; XOPAVX1-NEXT: # xmm2 = mem[0,0] ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshaq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 74dbee5e5d2ca..845aa699c1d25 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -212,7 +212,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -301,7 +301,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -342,7 +342,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 @@ -414,10 +414,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsraw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -539,7 +539,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem) ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index 4f8cbc07243fd..4164790d57625 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -26,14 +26,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrad %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrad %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -47,13 +47,13 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -63,14 +63,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v2i32: @@ -108,14 +108,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: psrad %xmm4, %xmm2 -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: psrad %xmm3, %xmm4 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE-NEXT: psrad %xmm4, %xmm5 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: psrad %xmm1, %xmm0 -; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X86-SSE-NEXT: movaps %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -167,10 +167,10 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -338,10 +338,10 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psraw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -541,7 +541,6 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm4 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -555,6 +554,7 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -573,7 +573,6 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -584,8 +583,9 @@ define <8 x i8> 
@var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 ; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: var_shift_v8i8: @@ -769,7 +769,6 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm4 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -783,6 +782,7 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -801,7 +801,6 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -812,8 +811,9 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 ; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: var_shift_v4i8: @@ -997,7 +997,6 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm4 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE41-NEXT: movdqa %xmm1, %xmm2 @@ -1011,6 +1010,7 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE41-NEXT: psraw $1, %xmm2 ; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1029,7 +1029,6 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 ; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 @@ -1040,8 +1039,9 @@ 
define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 ; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: var_shift_v2i8: @@ -1308,15 +1308,15 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v8i8: @@ -1325,12 +1325,12 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm3 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v8i8: @@ -1339,10 +1339,10 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1353,11 +1353,11 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: retq @@ -1423,15 +1423,15 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm3 ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X86-SSE-NEXT: psrlw %xmm1, %xmm2 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm3, %xmm0 +; X86-SSE-NEXT: psubb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer %shift = ashr <8 x i8> %a, %splat @@ -1446,15 +1446,15 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v4i8: @@ -1463,12 +1463,12 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm3 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v4i8: @@ -1477,10 +1477,10 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1491,11 +1491,11 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1561,15 +1561,15 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm3 ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X86-SSE-NEXT: psrlw %xmm1, %xmm2 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm3, %xmm0 +; X86-SSE-NEXT: psubb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer %shift = ashr <4 x i8> %a, %splat @@ -1584,15 +1584,15 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-NEXT: psrlw %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: psrlw %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE2-NEXT: psrlw %xmm1, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: psubb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: splatvar_shift_v2i8: @@ -1601,12 +1601,12 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: psrlw %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; SSE41-NEXT: psrlw %xmm1, %xmm3 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 
+; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatvar_shift_v2i8: @@ -1615,10 +1615,10 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1629,11 +1629,11 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -1690,15 +1690,15 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw %xmm1, %xmm0 ; X86-SSE-NEXT: pcmpeqd %xmm2, %xmm2 ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32896,32896,32896,32896,32896,32896,32896,32896] +; X86-SSE-NEXT: psrlw %xmm1, %xmm3 ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X86-SSE-NEXT: psrlw %xmm1, %xmm2 -; X86-SSE-NEXT: pxor %xmm2, %xmm0 -; X86-SSE-NEXT: psubb %xmm2, %xmm0 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pand %xmm1, %xmm0 +; X86-SSE-NEXT: pxor %xmm3, %xmm0 +; X86-SSE-NEXT: psubb %xmm3, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer %shift = ashr <2 x i8> %a, %splat @@ -1933,10 +1933,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -1944,12 +1944,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { 
; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i8: @@ -1977,7 +1977,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2005,10 +2005,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -2021,10 +2021,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -2032,12 +2032,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,128,64,32,256,256,256,256] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: @@ -2065,7 +2065,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2093,10 +2093,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -2109,10 +2109,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: psraw $8, %xmm0 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq @@ -2120,12 +2120,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,32,256,256,256,256,256,256] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: @@ -2153,7 +2153,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -2181,10 +2181,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -2321,7 +2321,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2335,26 +2335,18 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512DQVL-LABEL: splatconstant_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatconstant_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BWVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq +; AVX512VL-LABEL: splatconstant_shift_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, 
%xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: ; X86-SSE: # %bb.0: @@ -2391,7 +2383,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2405,26 +2397,18 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512DQVL-LABEL: splatconstant_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatconstant_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BWVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq +; AVX512VL-LABEL: splatconstant_shift_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v4i8: ; X86-SSE: # %bb.0: @@ -2461,7 +2445,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -2475,26 +2459,18 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512DQVL-LABEL: splatconstant_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatconstant_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BWVL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) -; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq +; AVX512VL-LABEL: splatconstant_shift_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem) +; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v2i8: ; X86-SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll index b453f925b94e8..4badb9319c0ce 100644 --- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -30,87 +30,85 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movd %r8d, %xmm1 ; SSE-NEXT: xorl %ecx, %ecx -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB0_4: # %vector.body ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm11 = mem[0],zero -; SSE-NEXT: pcmpeqb %xmm8, %xmm0 -; SSE-NEXT: pmovsxbd %xmm0, %xmm7 +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm9 = mem[0],zero +; SSE-NEXT: pcmpeqb %xmm6, %xmm0 +; SSE-NEXT: pmovsxbd %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE-NEXT: pcmpeqb %xmm8, %xmm1 -; SSE-NEXT: pmovsxbd %xmm1, %xmm5 +; SSE-NEXT: pcmpeqb %xmm6, %xmm1 +; SSE-NEXT: pmovsxbd %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm1, %xmm6 -; SSE-NEXT: pcmpeqb %xmm8, %xmm2 -; SSE-NEXT: pmovsxbd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm1, %xmm4 -; SSE-NEXT: pcmpeqb %xmm8, %xmm11 -; SSE-NEXT: pmovsxbd %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; SSE-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm11 +; SSE-NEXT: pmovsxbd %xmm1, %xmm3 +; SSE-NEXT: pcmpeqb %xmm6, %xmm5 +; SSE-NEXT: pmovsxbd %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm11 +; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pslld %xmm7, %xmm12 +; SSE-NEXT: pslld %xmm8, %xmm10 +; SSE-NEXT: blendvps %xmm0, %xmm12, %xmm10 ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pslld %xmm9, %xmm12 -; SSE-NEXT: pslld %xmm10, %xmm11 -; SSE-NEXT: blendvps %xmm0, %xmm12, %xmm11 -; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; 
SSE-NEXT: pslld %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12 -; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm7 -; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm6 -; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm5 +; SSE-NEXT: pslld %xmm7, %xmm12 +; SSE-NEXT: pslld %xmm8, %xmm11 +; SSE-NEXT: pmovsxbd %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5 -; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm4 +; SSE-NEXT: blendvps %xmm0, %xmm12, %xmm11 +; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm4 +; SSE-NEXT: pslld %xmm7, %xmm13 +; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm12 +; SSE-NEXT: pslld %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm4 -; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm3 +; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pslld %xmm7, %xmm13 +; SSE-NEXT: pslld %xmm8, %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm3 -; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pslld %xmm9, %xmm13 -; SSE-NEXT: pslld %xmm10, %xmm2 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pslld %xmm7, %xmm2 +; SSE-NEXT: pslld %xmm8, %xmm3 +; SSE-NEXT: pcmpeqb %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm3 +; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pslld %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[1,1,1,1] +; SSE-NEXT: pslld %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm2 -; SSE-NEXT: movups %xmm12, (%rdi,%rcx,4) -; SSE-NEXT: movups %xmm11, 16(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm7, 48(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm4, 64(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm5, 80(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm2, 96(%rdi,%rcx,4) -; SSE-NEXT: movups %xmm3, 112(%rdi,%rcx,4) +; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pslld %xmm7, %xmm5 +; SSE-NEXT: pslld %xmm8, %xmm1 +; SSE-NEXT: pmovsxbd %xmm13, %xmm0 +; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1 +; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pslld %xmm7, %xmm13 +; SSE-NEXT: pmovsxbd %xmm9, %xmm0 +; SSE-NEXT: pslld %xmm8, %xmm5 +; SSE-NEXT: blendvps %xmm0, %xmm13, %xmm5 +; SSE-NEXT: movups %xmm11, (%rdi,%rcx,4) +; SSE-NEXT: movups %xmm10, 16(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm12, 32(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm4, 48(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm2, 64(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm3, 80(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm5, 96(%rdi,%rcx,4) +; SSE-NEXT: movups %xmm1, 112(%rdi,%rcx,4) ; SSE-NEXT: addq $32, %rcx ; SSE-NEXT: cmpq %rcx, %rdx ; SSE-NEXT: jne .LBB0_4 @@ 
-163,7 +161,8 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm8[0],zero,xmm8[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero @@ -172,69 +171,70 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm9, %xmm9 +; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9 ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm14 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm15 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm10, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm9 ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm11, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm9 ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm11 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm12, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm9, %xmm9 -; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm12 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm3, %xmm12, %xmm2 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm4, %xmm12, %xmm12 -; AVX1-NEXT: vblendvps %xmm14, %xmm2, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm2 -; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm14 -; AVX1-NEXT: vpslld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vblendvps %xmm15, %xmm14, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15 +; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm10 +; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm12 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm0, %xmm15, %xmm14, %xmm0 +; AVX1-NEXT: vpslld %xmm4, %xmm10, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpslld %xmm5, %xmm10, %xmm10 +; AVX1-NEXT: vblendvps %xmm14, %xmm2, %xmm10, %xmm10 +; AVX1-NEXT: vpslld %xmm4, %xmm12, %xmm2 +; AVX1-NEXT: vpslld %xmm5, %xmm12, %xmm12 +; AVX1-NEXT: vblendvps %xmm15, %xmm2, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2 ; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1 -; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, 
%xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13 -; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11 -; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm10, %xmm15, %xmm14, %xmm10 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpslld %xmm4, %xmm2, %xmm15 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpslld %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm0, %xmm15, %xmm2, %xmm0 +; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm2 +; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm14 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm14, %xmm1 +; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm14 +; AVX1-NEXT: vpslld %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm15 +; AVX1-NEXT: vblendvps %xmm13, %xmm14, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm3, %xmm13, %xmm14 +; AVX1-NEXT: vpslld %xmm6, %xmm13, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] +; AVX1-NEXT: vblendvps %xmm9, %xmm14, %xmm13, %xmm9 +; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm7, %xmm13, %xmm14 +; AVX1-NEXT: vpslld %xmm8, %xmm13, %xmm13 +; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm15, %xmm14, %xmm13, %xmm13 ; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14 ; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 ; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm9, %xmm15, %xmm14, %xmm9 -; AVX1-NEXT: vmovups %xmm12, (%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm2, 16(%rdi,%rcx,4) +; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11 +; AVX1-NEXT: vmovups %xmm10, (%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm12, 16(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm0, 32(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm1, 48(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm13, 64(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm10, 96(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm9, 112(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm2, 64(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm9, 80(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm11, 112(%rdi,%rcx,4) ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rdx ; AVX1-NEXT: jne .LBB0_4 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 1d1697aa38bae..2e9907fbb2aa2 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -92,14 +92,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; 
SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -113,13 +113,13 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -129,14 +129,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i32: @@ -174,14 +174,14 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: psrld %xmm4, %xmm2 -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: psrld %xmm3, %xmm4 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE-NEXT: psrld %xmm4, %xmm5 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: psrld %xmm1, %xmm0 -; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X86-SSE-NEXT: movaps %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -233,10 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 
; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -365,23 +365,23 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -480,23 +480,23 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $2, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1 @@ -1193,7 +1193,7 @@ define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,1,3,3,2,2,4,4] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,3,3,2,2,4,4] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1254,7 +1254,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = 
[0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1309,7 +1309,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,2,4,6,1,2,3,4] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [7,2,4,6,1,2,3,4] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1388,9 +1388,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [2,4,8,16,32,64,128,256] -; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1401,8 +1401,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16,32,64,128,256] -; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] +; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1413,9 +1413,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1445,7 +1445,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1475,9 +1475,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [2,4,8,16,32,64,128,256] -; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] +; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -1661,11 +1661,11 @@ define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x ; ; AVX1-LABEL: vector_variable_shift_right: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index d8852956c66f3..85415d725b1e0 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -24,12 +24,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i64: @@ -71,12 +71,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v4i64: @@ -96,25 +96,25 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 ; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5 -; 
AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 +; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm7, %xmm2, %xmm7 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -152,32 +152,32 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; X86-AVX1-LABEL: var_shift_v8i32: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; X86-AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; X86-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; X86-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 -; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; X86-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; X86-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; X86-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; 
X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm5 +; X86-AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm6 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X86-AVX1-NEXT: vpsrld %xmm7, %xmm3, %xmm7 +; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; X86-AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; X86-AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX1-NEXT: vpsrld %xmm5, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7] +; X86-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpsrld %xmm6, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; X86-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm5[4,5,6,7] ; X86-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; X86-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v8i32: @@ -195,18 +195,18 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 -; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 @@ -297,18 +297,18 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; X86-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 ; X86-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; X86-AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm4 -; X86-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 -; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; X86-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 -; X86-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 ; X86-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 @@ -710,10 +710,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v32i8: @@ -795,10 +795,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: splatvar_shift_v32i8: @@ -1018,10 +1018,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_modulo_shift_v32i8: @@ -1107,10 +1107,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl 
; ; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: @@ -1306,7 +1306,7 @@ define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1382,7 +1382,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1462,8 +1462,7 @@ define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v32i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,2,4,6,1,2,3,4,7,2,4,6,1,2,3,4] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,4,6,1,2,3,4,7,2,4,6,1,2,3,4] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1592,11 +1591,11 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: constant_shift_v32i8: @@ -1622,11 +1621,11 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i8: @@ -1641,11 +1640,11 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm1 ; AVX512DQVL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v32i8: @@ -1683,11 +1682,11 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] -; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; X86-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, ret <32 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d5702fb93a..c69d4337fb55d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -324,12 +324,12 @@ define <32 x i16> 
@constant_shift_v32i16_pairs(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8_pairs: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] +; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmulhuw %ymm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15] ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpandq %zmm0, %zmm1, %zmm0 @@ -385,11 +385,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 79281116665f6..be35b415f58e5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -26,14 +26,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld %xmm4, %xmm2 -; 
SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: psrld %xmm4, %xmm5 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -47,13 +47,13 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrld %xmm1, %xmm4 ; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] ; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: retq ; @@ -63,14 +63,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v2i32: @@ -108,14 +108,14 @@ define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: psrld %xmm4, %xmm2 -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE-NEXT: psrld %xmm3, %xmm4 +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE-NEXT: psrld %xmm4, %xmm5 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] ; X86-SSE-NEXT: psrld %xmm1, %xmm0 -; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X86-SSE-NEXT: 
punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] ; X86-SSE-NEXT: movaps %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -167,10 +167,10 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -338,10 +338,10 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { ; SSE41-NEXT: psllw $4, %xmm1 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrlw $4, %xmm3 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -470,23 +470,23 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -585,23 +585,23 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $2, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; 
X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1 @@ -620,23 +620,23 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -735,23 +735,23 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $2, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1 @@ -770,23 +770,23 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm2, 
%xmm2 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 @@ -885,23 +885,23 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psrlw $2, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm1 @@ -1618,9 +1618,9 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1629,8 +1629,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1639,11 +1639,11 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> 
%a) nounwind { ; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,128,64,32,16,8,4,2] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v8i8: @@ -1671,7 +1671,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1699,9 +1699,9 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -1714,9 +1714,9 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq @@ 
-1725,8 +1725,8 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1735,11 +1735,11 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,128,64,32,256,256,256,256] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i8: @@ -1767,7 +1767,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1795,9 +1795,9 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; 
X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl @@ -1810,9 +1810,9 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: psrlw $8, %xmm0 ; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1821,8 +1821,8 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1831,11 +1831,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256] -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,32,256,256,256,256,256,256] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v2i8: @@ -1863,7 +1863,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1891,9 +1891,9 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; X86-SSE-NEXT: psrlw $8, %xmm0 ; X86-SSE-NEXT: packuswb %xmm2, %xmm0 ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-lut.ll b/llvm/test/CodeGen/X86/vector-shift-lut.ll index 0bf2006090893..2ffc10221d9f5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lut.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lut.ll @@ -84,8 +84,8 @@ define <16 x i8> @uniform_lshr_v16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 @@ -142,8 +142,8 @@ define <16 x i8> @uniform_ashr_v16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 @@ -290,13 +290,13 @@ define <32 x i8> @uniform_shl_v32i8(<32 x i8> %a) nounwind { ; ; AVX512DQ-LABEL: uniform_shl_v32i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,u,u,u,u,u,u,u,u,4,8,16,32,64,128,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: uniform_shl_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0,4,8,16,32,64,128,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,8,16,32,64,128,0,0,u,u,u,u,u,u,u,u,4,8,16,32,64,128,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i8> , %a @@ -402,13 +402,13 @@ define <32 x i8> @uniform_lshr_v32i8(<32 x i8> %a) nounwind { ; ; AVX512DQ-LABEL: uniform_lshr_v32i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: 
uniform_lshr_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0,5,2,1,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i8> , %a @@ -514,13 +514,13 @@ define <32 x i8> @uniform_ashr_v32i8(<32 x i8> %a) nounwind { ; ; AVX512DQ-LABEL: uniform_ashr_v32i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: uniform_ashr_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0,6,3,1,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i8> , %a @@ -694,7 +694,7 @@ define <64 x i8> @uniform_shl_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: uniform_shl_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u,7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -702,7 +702,7 @@ define <64 x i8> @uniform_shl_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: uniform_shl_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128,7,14,28,56,112,224,192,128] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u,7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u,7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u,7,14,28,56,112,224,192,128,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <64 x i8> , %a @@ -790,7 +790,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; ; SSE41-LABEL: uniform_lshr_v64i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,0] +; SSE41-NEXT: movd {{.*#+}} xmm4 = [1,0,0,0] ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pshufb %xmm0, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm6 @@ -807,7 +807,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; AVX1-LABEL: uniform_lshr_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -819,7 +819,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; ; AVX2-LABEL: uniform_lshr_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,1,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb 
%ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq @@ -844,7 +844,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; ; XOPAVX2-LABEL: uniform_lshr_v64i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,1,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; XOPAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; XOPAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; XOPAVX2-NEXT: retq @@ -852,7 +852,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: uniform_lshr_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,1,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -860,7 +860,7 @@ define <64 x i8> @uniform_lshr_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: uniform_lshr_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> , %a @@ -1010,7 +1010,7 @@ define <64 x i8> @uniform_ashr_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: uniform_ashr_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1018,7 +1018,7 @@ define <64 x i8> @uniform_ashr_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: uniform_ashr_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> , %a @@ -1032,11 +1032,11 @@ define <32 x i8> @perlane_shl_v32i8(<32 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psllw $5, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: paddb %xmm2, %xmm2 @@ -1172,8 +1172,8 @@ define <32 x i8> @perlane_lshr_v32i8(<32 x i8> %a) nounwind { ; SSE2-NEXT: pcmpgtb %xmm0, %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 ; SSE2-NEXT: pandn 
%xmm3, %xmm4 ; SSE2-NEXT: paddb %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 @@ -1208,7 +1208,7 @@ define <32 x i8> @perlane_lshr_v32i8(<32 x i8> %a) nounwind { ; ; AVX2-LABEL: perlane_lshr_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [259,0,66052,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1225,19 +1225,19 @@ define <32 x i8> @perlane_lshr_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX2-LABEL: perlane_lshr_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [259,0,66052,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: perlane_lshr_v32i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm1 = [259,0,66052,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: perlane_lshr_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm1 = [259,0,66052,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i8> , %a @@ -1319,7 +1319,7 @@ define <32 x i8> @perlane_ashr_v32i8(<32 x i8> %a) nounwind { ; ; AVX2-LABEL: perlane_ashr_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [66053,0,66310,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1336,19 +1336,19 @@ define <32 x i8> @perlane_ashr_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX2-LABEL: perlane_ashr_v32i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxdq {{.*#+}} ymm1 = [66053,0,66310,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: perlane_ashr_v32i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm1 = [66053,0,66310,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: perlane_ashr_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm1 = [66053,0,66310,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i8> , %a @@ -1360,11 +1360,11 @@ define <64 x i8> @perlane_shl_v64i8(<64 x i8> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: paddb %xmm1, %xmm1 @@ -1543,8 +1543,8 @@ define <64 x i8> @perlane_lshr_v64i8(<64 x i8> %a) nounwind { ; SSE2-NEXT: pcmpgtb %xmm5, %xmm2 ; SSE2-NEXT: paddb %xmm5, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm8 ; SSE2-NEXT: pandn 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pcmpgtb %xmm5, %xmm8 ; SSE2-NEXT: movdqa %xmm8, %xmm9 ; SSE2-NEXT: pandn %xmm2, %xmm9 ; SSE2-NEXT: psrlw $2, %xmm2 @@ -1636,9 +1636,9 @@ define <64 x i8> @perlane_lshr_v64i8(<64 x i8> %a) nounwind { ; ; AVX2-LABEL: perlane_lshr_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [259,0,66052,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [66053,0,66310,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; @@ -1646,40 +1646,40 @@ define <64 x i8> @perlane_lshr_v64i8(<64 x i8> %a) nounwind { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm0 -; XOPAVX1-NEXT: vpshlb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; XOPAVX1-NEXT: vpshlb %xmm4, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: perlane_lshr_v64i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [259,0,66052,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; XOPAVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [66053,0,66310,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: perlane_lshr_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm2 = [66053,0,66310,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm2 = [259,0,66052,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: perlane_lshr_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} zmm1 = [259,0,66052,0,66053,0,66310,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,4,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,5,2,1,0,0,0,0,0,u,u,u,u,u,u,u,u,6,3,1,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> , %a @@ -1714,8 +1714,8 @@ define <64 x i8> @perlane_ashr_v64i8(<64 x i8> %a) nounwind { ; SSE2-NEXT: pcmpgtb %xmm2, %xmm5 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm6, %xmm6 -; 
SSE2-NEXT: pcmpgtb %xmm2, %xmm6 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 ; SSE2-NEXT: pandn %xmm5, %xmm6 ; SSE2-NEXT: paddb %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm5, %xmm5 @@ -1749,7 +1749,7 @@ define <64 x i8> @perlane_ashr_v64i8(<64 x i8> %a) nounwind { ; ; SSE41-LABEL: perlane_ashr_v64i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,0] +; SSE41-NEXT: movd {{.*#+}} xmm4 = [1,0,0,0] ; SSE41-NEXT: pshufb %xmm1, %xmm4 ; SSE41-NEXT: movd {{.*#+}} xmm5 = [2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pshufb %xmm2, %xmm5 @@ -1770,7 +1770,7 @@ define <64 x i8> @perlane_ashr_v64i8(<64 x i8> %a) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,0,0,0] ; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -1778,9 +1778,9 @@ define <64 x i8> @perlane_ashr_v64i8(<64 x i8> %a) nounwind { ; ; AVX2-LABEL: perlane_ashr_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [258,0,259,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; @@ -1802,25 +1802,25 @@ define <64 x i8> @perlane_ashr_v64i8(<64 x i8> %a) nounwind { ; ; XOPAVX2-LABEL: perlane_ashr_v64i8: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; XOPAVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [258,0,259,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; XOPAVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; XOPAVX2-NEXT: retq ; ; AVX512DQ-LABEL: perlane_ashr_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm2 = [258,0,259,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: perlane_ashr_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxwq {{.*#+}} zmm1 = [0,0,1,0,258,0,259,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,1,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,2,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u,3,1,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> , %a diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 2b1cf5b671e53..8459194c5d4ea 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -282,14 +282,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; 
SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 @@ -311,10 +311,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE41-LABEL: var_shift_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -392,14 +392,14 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psllw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 @@ -988,8 +988,8 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq @@ -1033,8 +1033,8 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: retl @@ -1066,7 +1066,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1121,7 +1121,7 @@ define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v8i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: 
# kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [5,5,2,2,1,1,4,4] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,2,2,1,1,4,4] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1153,9 +1153,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,64,32,16,8,4,2,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1204,7 +1204,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1234,9 +1234,9 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [128,64,32,16,8,4,2,1] ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X86-SSE-NEXT: pand %xmm2, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] +; X86-SSE-NEXT: pand %xmm2, %xmm1 ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE-NEXT: retl @@ -1271,7 +1271,7 @@ define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,7,6,2,7,0,7,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [3,7,6,2,7,0,7,1] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 4f55f7af20f47..1f3831b3216cf 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -24,12 +24,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shift_v4i64: @@ -68,12 +68,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; X86-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; X86-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: var_shift_v4i64: @@ -641,10 +641,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_shift_v32i8: @@ -721,10 +721,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: splatvar_shift_v32i8: @@ -944,10 +944,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splatvar_modulo_shift_v32i8: @@ -1028,10 +1028,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8: @@ -1202,7 +1202,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; 
AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1270,7 +1270,7 @@ define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v16i16_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,2,3,3,0,0,1,1,6,6,7,7,4,4,5,5] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,3,3,0,0,1,1,6,6,7,7,4,4,5,5] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1313,7 +1313,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1388,7 +1388,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] ; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 ; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1449,7 +1449,7 @@ define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v32i8_pairs: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,7,6,2,7,0,7,1,5,0,4,7,6,1,4,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,6,2,7,0,7,1,5,0,4,7,6,1,4,0] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index d245bdca6ee29..7ed438001c891 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -341,14 +341,14 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 @@ -370,10 +370,10 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE41-LABEL: var_shift_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; 
SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -451,14 +451,14 @@ define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psllw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 @@ -485,14 +485,14 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 @@ -514,10 +514,10 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE41-LABEL: var_shift_v4i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -595,14 +595,14 @@ define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psllw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 @@ -629,14 +629,14 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: psllw $5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; 
SSE2-NEXT: pcmpgtb %xmm1, %xmm3 @@ -658,10 +658,10 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE41-LABEL: var_shift_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psllw $4, %xmm3 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: psllw $5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 @@ -739,14 +739,14 @@ define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: psllw $5, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pxor %xmm3, %xmm3 -; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pandn %xmm0, %xmm4 +; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pandn %xmm0, %xmm3 ; X86-SSE-NEXT: psllw $4, %xmm0 -; X86-SSE-NEXT: pand %xmm3, %xmm0 +; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: por %xmm4, %xmm0 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: por %xmm3, %xmm0 ; X86-SSE-NEXT: paddb %xmm1, %xmm1 ; X86-SSE-NEXT: pxor %xmm3, %xmm3 ; X86-SSE-NEXT: pcmpgtb %xmm1, %xmm3 @@ -1478,7 +1478,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1567,7 +1567,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1656,7 +1656,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 152814fbc631b..11f66bf19574f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -611,7 +611,7 @@ define <16 x i8> 
@shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; ; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -630,7 +630,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; ; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -669,7 +669,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; ; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -688,7 +688,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; ; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -747,7 +747,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; ; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -767,7 +767,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; ; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -931,29 +931,13 @@ define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> } define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<16 x i8> %a, <16 x i8> %b) { -; SSE2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; SSSE3: # 
%bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX1: # %bb.0: @@ -965,7 +949,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30( ; ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1085,7 +1069,7 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; ; AVX2-LABEL: load_fold_pblendvb: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] ; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1105,7 +1089,7 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; ; XOPAVX2-LABEL: load_fold_pblendvb: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] ; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 @@ -1149,7 +1133,7 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; ; AVX2-LABEL: load_fold_pblendvb_commute: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] ; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1171,7 +1155,7 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; ; XOPAVX2-LABEL: load_fold_pblendvb_commute: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] ; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 @@ -2180,29 +2164,13 @@ define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09( } define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { -; SSE2-LABEL: PR12412: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR12412: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = 
[255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR12412: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: PR12412: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: PR12412: ; AVX1: # %bb.0: # %entry @@ -2214,7 +2182,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; ; AVX2-LABEL: PR12412: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index 0eb72c8bc0be4..e648540b8911c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -370,7 +370,7 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0124: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,2,4] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -418,7 +418,7 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0142: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,1,4,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -469,7 +469,7 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0412: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,1,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -511,7 +511,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_4012: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -545,7 +545,7 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_0451: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,4,5,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1] ; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -580,7 +580,7 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_4015: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,1,5] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5] ; 
AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -1549,7 +1549,7 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v4i32_2456: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [6,0,1,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,1,2] ; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512VL-NEXT: retq @@ -2516,7 +2516,7 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) { ; ; AVX512VL-LABEL: shuffle_mem_v4f32_0624: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,2,6,0] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [4,2,6,0] ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, ptr %a1 @@ -2540,7 +2540,7 @@ define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) { ; ; AVX512VL-LABEL: shuffle_mem_v4f32_4760: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,2,4] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4] ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, ptr %a1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index f3659fd934e71..7684897d8cd0b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1248,7 +1248,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_0213cedf: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,1,3,12,14,13,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1370,7 +1370,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: shuffle_v8i16_032dXXXX: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,3,2,13,0,13,0,1] ; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; @@ -1557,7 +1557,7 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_012dcde3: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,13,12,13,14,3] ; AVX512VL-FAST-ALL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1666,7 +1666,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i16_XXX1X579: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,4,5,7,9] ; AVX512VL-FAST-ALL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1724,7 +1724,7 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: shuffle_v8i16_XX4X8acX: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,8,10,12,10] ; AVX512VL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 
dbbfaab9ea26a..0fe0d06607e66 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -57,7 +57,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -69,7 +69,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -112,7 +112,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -124,7 +124,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -167,7 +167,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -179,7 +179,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -219,7 +219,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -257,7 +257,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,5,0,0] +; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -295,7 +295,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -333,7 +333,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,7,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -374,7 +374,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -416,7 +416,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,9] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,9,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -457,7 +457,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -498,7 +498,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [15,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1434,7 +1434,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1 ; ; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1452,7 +1452,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1470,7 +1470,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3 ; ; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1488,7 +1488,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3 ; ; XOPAVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1553,7 +1553,7 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1594,7 +1594,7 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1647,7 +1647,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1 ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1694,7 +1694,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; ; AVX512VL-LABEL: 
shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2080,7 +2080,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2122,7 +2122,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2584,7 +2584,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2633,7 +2633,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2 ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2684,7 +2684,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2 ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2735,7 +2735,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -2776,7 +2776,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3090,7 +3090,7 @@ 
define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3259,7 +3259,7 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_1 ; ; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3295,7 +3295,7 @@ define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_2 ; ; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3334,7 +3334,7 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3373,7 +3373,7 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3400,9 +3400,9 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3417,7 +3417,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3462,7 +3462,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; ; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3508,7 +3508,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3523,7 +3523,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3567,7 +3567,7 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,0,2,0,11,0,8,0,9,0,10,0,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,0,u,1,u,2,u,11,u,8,u,9,u,10,u,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3607,7 +3607,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,0,5,0,6,0,15,0,12,0,13,0,14,0,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,u,5,u,6,u,15,u,12,u,13,u,14,u,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3656,7 +3656,7 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3701,7 +3701,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,4,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3716,7 +3716,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8] ; 
AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3757,7 +3757,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3807,7 +3807,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3851,7 +3851,7 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3895,7 +3895,7 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0 ; ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3938,7 +3938,7 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -3953,7 +3953,7 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3997,7 +3997,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,4,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4012,7 +4012,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; ; AVX512VL-LABEL: 
shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4056,7 +4056,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4071,7 +4071,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4115,7 +4115,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4130,7 +4130,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4174,7 +4174,7 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4189,7 +4189,7 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4232,7 +4232,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} 
ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4274,7 +4274,7 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4316,7 +4316,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4351,28 +4351,28 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,0,5,7,6,4,5] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,2,3,6,7,10,11,0,1,4,5,14,15,16,17,16,17,18,19,22,23,26,27,16,17,20,21,30,31] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4386,7 +4386,7 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; XOPAVX2-NEXT: retq @@ -4415,7 +4415,7 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; 
; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4430,7 +4430,7 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4474,7 +4474,7 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4489,7 +4489,7 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4533,7 +4533,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4548,7 +4548,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4592,7 +4592,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4607,7 +4607,7 @@ define <16 x i16> 
@shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4652,7 +4652,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,3,7,4,6,7,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,7,4,6,7,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27] ; AVX2-FAST-ALL-NEXT: retq @@ -4666,7 +4666,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4711,7 +4711,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,6,0,4,6,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4726,7 +4726,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,0,12,12,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4770,7 +4770,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4785,7 +4785,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,0,8,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4829,7 +4829,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: 
shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,6,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4844,7 +4844,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,0,12,12,8,12,12,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,4,0,4,4,4,12,u,12,12,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4917,7 +4917,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,5,8,9,4,5,6,11,12,13,8,9,12,13,14,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,8,9,4,5,6,11,12,13,8,9,12,13,14,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5021,7 +5021,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5062,7 +5062,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5097,28 +5097,28 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; ; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,6,5,7,4,6] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,8,9,0,1,6,7,2,3,14,15,18,19,22,23,26,27,24,25,16,17,22,23,18,19,30,31] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5132,7 +5132,7 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; ; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; XOPAVX2-NEXT: retq @@ -5161,7 +5161,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5192,9 +5192,9 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -5209,7 +5209,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5218,8 +5218,8 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],xmm3[14,15],xmm1[u,u,u,u,u,u,u,u] +; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq @@ -5256,7 +5256,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3 ; ; 
AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5287,11 +5287,11 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,14,15] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: @@ -5304,7 +5304,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5313,10 +5313,10 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm2[6,7] +; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27: @@ -5358,7 +5358,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,7,0,4,7,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,7,u,4,7,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5376,7 +5376,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5429,7 +5429,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,4,0,6,4,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5447,7 +5447,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5509,7 +5509,7 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2 ; ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5557,7 +5557,7 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1 ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5604,7 +5604,7 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1 ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5664,7 +5664,7 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3 ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5720,7 +5720,7 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,4,3,18,0,0,0,0,12,12,11,26,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5765,7 +5765,7 @@ define <16 x i16> 
@shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,2,21,0,0,0,0,8,11,10,29,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5817,7 +5817,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,21,0,0,0,0,8,9,10,29,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5860,7 +5860,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,6,27,0,0,0,0,12,13,14,27] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5904,7 +5904,7 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,5,6,19,0,0,0,0,12,13,14,27,0,0,0,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5952,7 +5952,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5997,7 +5997,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6062,7 +6062,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,1,0,5,7,25,0,0,0,9,0,13,15,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6114,7 +6114,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,20,0,0,2,4,0,0,0,28,0,8,10,12,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, 
%ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6160,7 +6160,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6236,7 +6236,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6342,7 +6342,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1 ; ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6419,7 +6419,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6525,7 +6525,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2 ; ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6603,7 +6603,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2 ; ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6646,7 +6646,7 @@ define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_2 ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6721,7 +6721,7 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u ; ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[7,0,19,0,4,4,21,0,15,0,27,0,12,12,29,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6844,7 +6844,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX512VL-FAST-CROSSLANE: # %bb.0: -; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-CROSSLANE-NEXT: retq ; @@ -6994,7 +6994,7 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2 ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -7045,7 +7045,7 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -7397,7 +7397,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: ; AVX512VL-FAST-CROSSLANE: # %bb.0: -; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-CROSSLANE-NEXT: retq ; @@ -7447,7 +7447,7 @@ define <16 x i16> @shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_1 ; ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11: ; AVX512VL-FAST-CROSSLANE: # %bb.0: -; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-CROSSLANE-NEXT: retq ; @@ -7492,7 +7492,7 @@ define <16 x i16> @shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -7572,9 +7572,9 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,u,u,10,11,4,5,14,15,u,u,0,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -7590,16 +7590,16 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: PR24935: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,5,0,0,0,4,6,2] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,u,u,0,4,6,2] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,6,3,0,0,6,4,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,3,0,0,6,4,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25] ; AVX2-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -7616,13 +7616,13 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5],zero,zero,ymm1[10,11,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -7652,7 +7652,7 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; XOPAVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[65535,65535,0,65535,65535,65535,0,65535,0,0,65535,65535,0,0,0,65535] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -7704,7 +7704,7 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) { ; ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12] ; AVX512VL-NEXT: vptestnmw %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} ; AVX512VL-NEXT: retq @@ -7725,10 +7725,10 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) { ; ; XOPAVX2-LABEL: PR34369: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; XOPAVX2-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm2[10,11],xmm0[8,9,10,11,12,13],xmm2[4,5] -; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vpperm {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[10,11],xmm2[8,9,10,11,12,13],xmm0[4,5] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 @@ -8075,18 +8075,18 @@ define <16 x i16> @pr43230(<16 x i16> %a, <16 x i16> %b) { ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index dbbd6b19b2829..db2f8cf4c1154 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -63,7 +63,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -107,7 +107,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,512] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -151,7 +151,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,3] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -195,7 +195,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,0,67108864] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -239,7 +239,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -283,7 +283,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,0,1536] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -327,7 +327,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,7] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -415,7 +415,7 @@ define <32 x i8> 
@shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -459,7 +459,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,2560,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -503,7 +503,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,11,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -547,7 +547,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,201326592,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -591,7 +591,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -635,7 +635,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,3584,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -679,7 +679,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,15,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb 
%ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -772,7 +772,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,0,0,17] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -819,7 +819,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} xmm1 = [0,0,0,4608] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -866,7 +866,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,19] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -913,7 +913,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} xmm1 = [0,335544320] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -960,7 +960,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,0,0,0,0,21,0,0] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1007,7 +1007,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} xmm1 = [0,5632] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1054,7 +1054,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,23] +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1242,7 +1242,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,27,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1430,7 +1430,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} xmm1 = [31,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1959,7 +1959,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1995,7 +1995,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -2384,7 +2384,7 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_ ; ; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] ; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -2422,7 +2422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,0,0,0,0,0,0,1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, 
%xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2436,7 +2436,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,0,0,0,0,0,0,1] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2454,7 +2454,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmovsxwd {{.*#+}} xmm2 = [0,0,0,512] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2468,7 +2468,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxwd {{.*#+}} xmm2 = [0,0,0,512] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2486,7 +2486,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2500,7 +2500,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2582,7 +2582,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2596,7 +2596,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: 
shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3138,16 +3138,16 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u] ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u],zero,zero,xmm5[u,u,u,u,1,6,13,u,u],zero,xmm5[u,u] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255] @@ -3158,7 +3158,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,6,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3171,7 +3171,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX512VLBW-NEXT: vpmovsxbq 
{{.*#+}} ymm2 = [0,5,6,1] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; AVX512VLBW-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3207,7 +3207,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; XOPAVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,6,1] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3890,7 +3890,7 @@ define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_ ; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,u,u] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -3900,7 +3900,7 @@ define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_ ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15] ; AVX512VLBW-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; @@ -3922,7 +3922,7 @@ define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_ ; XOPAVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u] -; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,u,u] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -4259,7 +4259,7 @@ define <32 x i8> @shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_ ; ; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4337,7 +4337,7 @@ define <32 x i8> 
@shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -4603,7 +4603,7 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_ ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4649,7 +4649,7 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_ ; ; AVX512VLBW-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4878,9 +4878,9 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[1],zero,xmm1[3],zero,xmm1[5],zero,xmm1[7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],zero,xmm0[4],zero,xmm0[6],zero,xmm0[1],zero,xmm0[3],zero,xmm0[5],zero,xmm0[7],zero +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index d848a8b879215..f750c353daaf8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -332,7 +332,7 @@ define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_0423: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,2,3] +; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,2,3] ; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -349,7 +349,7 @@ define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: 
shuffle_v4f64_0462: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,6,2] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,6,2] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -461,7 +461,7 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1054: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -489,7 +489,7 @@ define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3254: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -517,7 +517,7 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_3276: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [3,2,7,6] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -545,7 +545,7 @@ define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_1076: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,7,6] ; AVX512VL-FAST-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -569,7 +569,7 @@ define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: shuffle_v4f64_0415: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,1,5] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -586,7 +586,7 @@ define <4 x double> @shuffle_v4f64_2741(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-LABEL: shuffle_v4f64_2741: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,7,4,1] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [2,7,4,1] ; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -675,7 +675,7 @@ define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0456: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -762,7 +762,7 @@ define <4 x double> @shuffle_v4f64_0044(<4 x double> %a, <4 x double> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -794,7 +794,7 @@ define <4 x double> 
@shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) ; AVX512VL-FAST-ALL: # %bb.0: ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,4,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -829,7 +829,7 @@ define <4 x double> @shuffle_v4f64_1032_v2f64(<2 x double> %a, <2 x double> %b) ; AVX512VL-FAST-ALL: # %bb.0: ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -891,7 +891,7 @@ define <4 x double> @shuffle_v4f64_1436_split_load(ptr %px, ptr %py) { ; AVX512VL-FAST-ALL-NEXT: vmovapd 16(%rsi), %xmm1 ; AVX512VL-FAST-ALL-NEXT: vmovupd (%rdi), %ymm2 ; AVX512VL-FAST-ALL-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3] -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,1,2,4] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm0 = [0,1,2,4] ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1141,7 +1141,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0124: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,2,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1170,7 +1170,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0142: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,1,4,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,2] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1194,7 +1194,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0412: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,2] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,2] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1222,7 +1222,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_4012: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1261,7 +1261,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0451: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,5,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1294,7 +1294,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_4015: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,5] +; AVX512VL-NEXT: vmovdqa 
{{.*#+}} ymm2 = [4,0,1,5] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1323,7 +1323,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2u35: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,5,3,5] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1353,7 +1353,7 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_1251: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,5,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1381,7 +1381,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1415,7 +1415,7 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1449,7 +1449,7 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3276: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,2,7,6] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,7,6] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1483,7 +1483,7 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,7,6] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,7,6] ; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1507,7 +1507,7 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0415: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1536,7 +1536,7 @@ define <4 x i64> @shuffle_v4i64_2741(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2741: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,7,4,1] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1565,7 +1565,7 @@ define <4 x i64> @shuffle_v4i64_0437(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0437: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,4,3,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,3,7] ; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; 
AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> @@ -1725,7 +1725,7 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX512VL-FAST-ALL: # %bb.0: ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1767,7 +1767,7 @@ define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX512VL-FAST-ALL: # %bb.0: ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,5,4] ; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2241,9 +2241,9 @@ define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: add_v4i64_0246_1357: ; AVX512VL-FAST-ALL: # %bb.0: # %entry -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm3 ; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -2297,9 +2297,9 @@ define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-FAST-ALL-LABEL: add_v4i64_4602_5713: ; AVX512VL-FAST-ALL: # %bb.0: # %entry -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,4,6] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,5,7] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm3 ; AVX512VL-FAST-ALL-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index bd78dbded0705..c7a398bf316df 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -56,7 +56,7 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -103,7 +103,7 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -150,7 +150,7 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} 
ymm1 = [0,0,3,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -172,17 +172,11 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_00040000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_00040000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_00040000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -195,17 +189,11 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_00500000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_00500000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,5] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_00500000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -218,17 +206,11 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_06000000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_06000000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,0,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -241,17 +223,11 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_70000000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_70000000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -273,17 +249,11 @@ define <8 x float> 
@shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_00112233: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_00112233: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_00112233: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -322,7 +292,7 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -394,7 +364,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -456,7 +426,7 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_08192a3b: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -502,7 +472,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -533,7 +503,7 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -577,7 +547,7 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_09ab1def: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -933,12 +903,11 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_c348cda0: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,4,0,0,3,4,0] -; AVX2-FAST-ALL-NEXT: 
# ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,3,4,u,u,u,u,0] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [4,5,2,0,4,5,2,0] +; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] -; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-ALL-NEXT: retq ; @@ -954,7 +923,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -984,7 +953,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7,2,7,2,7,2,7,2] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,u,u,u,u,u,u,2] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,5,1,1,2,3,5,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 @@ -1002,7 +971,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1138,7 +1107,7 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1166,7 +1135,7 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1194,7 +1163,7 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1216,7 +1185,7 @@ define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; 
AVX512VL-FAST-ALL-NEXT: retq ; @@ -1244,7 +1213,7 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1280,7 +1249,7 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) { ; ; AVX512VL-FAST-ALL-LABEL: PR21138: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1308,7 +1277,7 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -1331,7 +1300,7 @@ define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1386,7 +1355,7 @@ define <8 x float> @shuffle_v8f32_084c195d(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_084c195d: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] ; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1434,7 +1403,7 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_089abcde: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1457,7 +1426,7 @@ define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_0189abcd: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovapd %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -1489,7 +1458,7 @@ define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) { ; ; AVX512VL-LABEL: shuffle_v8f32_01289abc: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1629,7 +1598,7 @@ define <8 x float> @shuffle_mem_v8f32_8BA0CFE4(<8 x float> %a0, ptr %a1) { ; ; AVX512VL-FAST-LABEL: 
shuffle_mem_v8f32_8BA0CFE4: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] ; AVX512VL-FAST-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %1 = load <8 x float>, ptr %a1 @@ -1686,7 +1655,7 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,1] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1733,7 +1702,7 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1780,7 +1749,7 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,3,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1802,17 +1771,11 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00040000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00040000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1825,17 +1788,11 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00500000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00500000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,5] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1848,17 +1805,11 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_06000000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_06000000: -; AVX512VL: # 
%bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,0,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1871,17 +1822,11 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_70000000: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_70000000: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1934,7 +1879,7 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -1981,7 +1926,7 @@ define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2059,7 +2004,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] ; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2121,7 +2066,7 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_08192a3b: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2167,7 +2112,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2198,7 +2143,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_091b2d3f: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; 
AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] ; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2242,7 +2187,7 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_09ab1def: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2383,17 +2328,11 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00015444: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00015444: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2404,17 +2343,11 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00204644: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00204644: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2425,17 +2358,11 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_03004474: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_03004474: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2446,17 +2373,11 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10004444: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10004444: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = 
[1,0,0,0,4,4,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2467,17 +2388,11 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_22006446: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_22006446: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2488,17 +2403,11 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_33307474: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_33307474: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2509,17 +2418,11 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_32104567: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_32104567: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2530,17 +2433,11 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00236744: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00236744: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: +; AVX2OR512VL: # %bb.0: +; 
AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2551,17 +2448,11 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00226644: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00226644: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2572,17 +2463,11 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10324567: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10324567: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2593,17 +2478,11 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_11334567: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_11334567: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2614,17 +2493,11 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235467: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235467: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> 
%a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2635,17 +2508,11 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235466: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235466: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2656,17 +2523,11 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_002u6u44: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,u,6,u,4,4] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_002u6u44: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,2,0,6,0,4,4] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,u,6,u,4,4] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2677,17 +2538,11 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00uu66uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,u,u,6,6,u,u] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00uu66uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,6,6,0,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,u,u,6,6,u,u] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2698,17 +2553,11 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_103245uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,u,u] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_103245uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,0,3,2,4,5,0,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,u,u] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2719,17 +2568,11 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps 
{{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_1133uu67: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,u,u,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_1133uu67: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,1,3,3,0,0,6,7] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,u,u,6,7] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2740,17 +2583,11 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_0uu354uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,u,u,3,5,4,u,u] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_0uu354uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,3,5,4,0,0] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,u,u,3,5,4,u,u] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2761,17 +2598,11 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_uuu3uu66: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [u,u,u,3,u,u,6,6] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_uuu3uu66: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,3,0,0,6,6] -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [u,u,u,3,u,u,6,6] +; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2812,7 +2643,7 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -2949,7 +2780,7 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -2977,7 +2808,7 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; 
AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -3005,7 +2836,7 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3027,7 +2858,7 @@ define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_7654fedc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq ; @@ -3055,7 +2886,7 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_fedc7654: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3084,7 +2915,7 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -3107,7 +2938,7 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -3139,7 +2970,7 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_089abcde: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3174,7 +3005,7 @@ define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_0189abcd: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] ; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3212,7 +3043,7 @@ define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_01289abc: ; AVX512VL-FAST-ALL: # %bb.0: -; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] +; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,10,0,1,2,3,4] ; AVX512VL-FAST-ALL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-ALL-NEXT: vmovdqa 
%ymm2, %ymm0 ; AVX512VL-FAST-ALL-NEXT: retq @@ -3420,7 +3251,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_0dcd3f14: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,5,4,5,11,7,9,12] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,5,4,5,11,7,9,12] ; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -3807,10 +3638,10 @@ define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: add_v8i32_02468ACE_13579BDF: @@ -3831,10 +3662,10 @@ define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[2],ymm2[2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: add_v8i32_8ACE0246_9BDF1357: @@ -3881,7 +3712,7 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float ; AVX512VL-FAST: # %bb.0: # %entry ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,3,3] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] ; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -3989,7 +3820,7 @@ define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) { ; ; AVX512VL-LABEL: lowhalf_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,14] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -4015,7 +3846,7 @@ define <8 x float> @lowhalf_v8f32(<8 x float> %x, <8 x float> %y) { ; ; AVX512VL-LABEL: lowhalf_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,14] +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [2,14,3,14] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 87c135ddcec95..e831bfe783ac9 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; 
AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> @@ -163,8 +163,7 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 @@ -179,8 +178,7 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 @@ -210,7 +208,7 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] @@ -218,7 +216,7 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -226,13 +224,13 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512BW-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u,33,35,34,37,39,38,41,43,42,45,47,46,u,u,u,u,49,51,50,53,55,54,57,59,58,61,63,62,u,u,u,u] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,4,5,6,22,23] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11] @@ -240,7 +238,7 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_ ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,13,15,14,1,3,2,5,u,u,u,u,u,u,u,u,26,29,31,30,17,19,18,21,23,22,25,27,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,4,1,1,1,1] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: retq @@ -463,8 +461,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] ; AVX512F-NEXT: vpshufb 
%ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -481,8 +478,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -503,17 +499,16 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -527,17 +522,16 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; 
AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -560,7 +554,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,16777215,0,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -587,7 +581,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,16777215,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -614,7 +608,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,16777215,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -644,26 +638,26 @@ define <64 
x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512F-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[3,5,9,11,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -671,26 +665,26 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512BW-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[3,5,9,11,15] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -698,26 +692,26 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512DQ-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[3,5,9,11,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,1,3,7,9,13,15],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -748,14 +742,14 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] -; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -775,14 +769,14 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] -; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -802,14 +796,14 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,0,4,6,10,12],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,zero,zero,xmm1[0,2,6,8,12,14] -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -827,26 +821,26 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512F-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,4,8,10,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -854,26 +848,26 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512BW-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,4,8,10,14] ; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -881,26 +875,26 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512DQ-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm4 = [18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm4[2,4,8,10,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,0,2,6,8,12,14],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -928,7 +922,7 @@ define <64 x i8> 
@shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -945,7 +939,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -962,7 +956,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -986,10 +980,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1003,10 +997,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512BW-NEXT: vmovdqa {{.*#+}} 
ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -1020,10 +1014,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -1223,10 +1217,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -1240,10 +1234,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512BW-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: 
vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -1257,10 +1251,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm3 = [0,0,0,0,0,4294967040,4294967295,4294967295] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [0,18446744073692774400,18446744073709551615,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq @@ -1482,7 +1476,7 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1644,7 +1638,7 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) { ; ; AVX512BW-LABEL: PR54562_ref: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,1,2,3,4,4,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: retq @@ -1679,8 +1673,8 @@ define void @PR54562_mem(ptr %src, ptr %dst) { ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2] +; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rsi) @@ -1689,7 +1683,7 @@ define void @PR54562_mem(ptr %src, ptr %dst) { ; ; AVX512BW-LABEL: PR54562_mem: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,2,3,4,4,5] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,2,3,4,4,5] ; AVX512BW-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30,33,32,34,33,36,35,37,36,39,38,40,39,42,41,43,42,53,52,54,53,56,55,57,56,59,58,60,59,62,61,63,62] ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) @@ -1702,8 +1696,8 @@ define void @PR54562_mem(ptr %src, ptr %dst) { ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index fce98cd470bcd..b90919738478f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -44,61 +44,97 @@ define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) { } define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00000010: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00000010: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00000010: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00000200: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00000200: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00000200: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00003000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00003000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00003000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00040000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00040000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00040000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = 
[0,0,0,0,0,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00500000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00500000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00500000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_06000000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_06000000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_06000000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -106,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -123,21 +159,33 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00112233: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00112233: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00112233: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00001111: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00001111: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00001111: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps 
{{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -170,106 +218,171 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_08084c4c: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_08084c4c: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_08084c4c: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_8823cc67: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_8823cc67: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_8823cc67: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_9832dc76: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_9832dc76: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_9832dc76: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_9810dc54: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_9810dc54: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_9810dc54: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = 
[1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_08194c5d: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_08194c5d: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_08194c5d: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_2a3b6e7f: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_2a3b6e7f: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_2a3b6e7f: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_08192a3b: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_08192a3b: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_08192a3b: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_08991abb: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_08991abb: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_08991abb: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x 
double> %b) { -; ALL-LABEL: shuffle_v8f64_091b2d3f: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_091b2d3f: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_091b2d3f: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_09ab1def: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_09ab1def: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_09ab1def: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -392,91 +505,145 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00015444: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00015444: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00015444: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00204644: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00204644: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00204644: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_03004474: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_03004474: +; 
AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_03004474: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_10004444: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_10004444: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_10004444: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_22006446: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_22006446: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_22006446: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_33307474: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_33307474: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_33307474: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_32104567: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_32104567: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_32104567: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x 
double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00236744: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00236744: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00236744: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00226644: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00226644: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00226644: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -518,21 +685,33 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_002u6u44: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,6,0,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_002u6u44: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_00uu66uu: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,6,6,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_00uu66uu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -574,22 +753,35 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { } define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_c348cda0: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8] -; ALL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovapd %zmm2, %zmm0 -; ALL-NEXT: 
ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_c348cda0: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_c348cda0: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0] +; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { -; ALL-LABEL: shuffle_v8f64_f511235a: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [15,5,1,1,2,3,5,10] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8f64_f511235a: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [15,5,1,1,2,3,5,10] +; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_f511235a: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [15,0,5,0,1,0,1,0,2,0,3,0,5,0,10,0] +; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -670,61 +862,97 @@ define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00000010: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00000010: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00000010: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00000200: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00000200: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00000200: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00003000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00003000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00003000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00040000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00040000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00040000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00500000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,5,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00500000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00500000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_06000000: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_06000000: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_06000000: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -732,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -767,21 +995,33 @@ define <8 x i64> @shuffle_v8i64_01014545_mem(ptr %ptr, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00112233: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00112233: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00112233: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl 
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00001111: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00001111: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00001111: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -816,106 +1056,171 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_08084c4c: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_08084c4c: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_08084c4c: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0] +; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_8823cc67: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_8823cc67: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_8823cc67: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_9832dc76: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_9832dc76: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_9832dc76: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { -; 
ALL-LABEL: shuffle_v8i64_9810dc54: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_9810dc54: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_9810dc54: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_08194c5d: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_08194c5d: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_08194c5d: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0] +; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_2a3b6e7f: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_2a3b6e7f: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_2a3b6e7f: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0] +; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_08192a3b: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_08192a3b: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_08192a3b: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0] +; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_08991abb: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_08991abb: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, 
%zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_08991abb: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_091b2d3f: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] -; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_091b2d3f: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_091b2d3f: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0] +; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_09ab1def: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_09ab1def: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_09ab1def: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -1038,202 +1343,323 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { } define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00015444: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00015444: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00015444: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00204644: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00204644: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00204644: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 
+; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_03004474: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_03004474: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_03004474: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_10004444: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_10004444: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_10004444: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_22006446: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_22006446: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_22006446: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_33307474: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_33307474: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_33307474: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_32104567: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_32104567: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; 
+; AVX512F-32-LABEL: shuffle_v8i64_32104567: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00236744: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00236744: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00236744: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00226644: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00226644: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00226644: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_10324567: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_10324567: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_10324567: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_11334567: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_11334567: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_11334567: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_01235467: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: 
ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_01235467: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01235467: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_01235466: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_01235466: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01235466: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_002u6u44: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,2,0,6,0,4,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_002u6u44: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_002u6u44: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_00uu66uu: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,6,6,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_00uu66uu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_103245uu: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,0,3,2,4,5,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_103245uu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,u,u] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_103245uu: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> 
@shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_1133uu67: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,1,3,3,0,0,6,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_1133uu67: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,u,u,6,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_1133uu67: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_0uu354uu: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,5,4,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_0uu354uu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,u,u,3,5,4,u,u] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_uuu3uu66: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,3,0,0,6,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_uuu3uu66: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,3,u,u,6,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_uuu3uu66: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { -; ALL-LABEL: shuffle_v8i64_6caa87e5: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13] -; ALL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: shuffle_v8i64_6caa87e5: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0] +; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -1689,9 +2115,9 @@ define <8 x double> @concat_shuffle_v8f64_v2f64_10325476(<2 x double> %a0, <2 x ; AVX512F-32-NEXT: subl $12, %esp ; AVX512F-32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-32-NEXT: vinsertf128 $1, {{[0-9]+}}(%esp), %ymm2, %ymm2 ; AVX512F-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinsertf128 $1, {{[0-9]+}}(%esp), 
%ymm2, %ymm1 -; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6] ; AVX512F-32-NEXT: addl $12, %esp ; AVX512F-32-NEXT: retl @@ -1822,49 +2248,79 @@ define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) { ;FIXME: compressp define <4 x double> @test_v8f64_2346 (<8 x double> %v) { -; ALL-LABEL: test_v8f64_2346: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,3,4,6] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: test_v8f64_2346: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [2,3,4,6] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: test_v8f64_2346: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,3,0,4,0,6,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-32-NEXT: retl %res = shufflevector <8 x double> %v, <8 x double> poison, <4 x i32> ret <4 x double> %res } ;FIXME: compressp define <2 x double> @test_v8f64_34 (<8 x double> %v) { -; ALL-LABEL: test_v8f64_34: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,4] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: test_v8f64_34: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [3,4] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: test_v8f64_34: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,4,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl %res = shufflevector <8 x double> %v, <8 x double> poison, <2 x i32> ret <2 x double> %res } ; FIXME: vpcompress define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) { -; ALL-LABEL: test_v8i64_1257: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,5,7] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: test_v8i64_1257: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,5,7] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: test_v8i64_1257: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,2,0,5,0,7,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-32-NEXT: retl %res = shufflevector <8 x i64> %v, <8 x i64> poison, <4 x i32> ret <4 x i64> %res } define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) { -; ALL-LABEL: test_v8i64_2_5: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,5] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: ret{{[l|q]}} +; AVX512F-LABEL: test_v8i64_2_5: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [2,5] +; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; 
AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: test_v8i64_2_5: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [2,0,5,0] +; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl %res = shufflevector <8 x i64> %v, <8 x i64> poison, <2 x i32> ret <2 x i64> %res } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 07498c1233b5d..44eb620734b55 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -38,7 +38,7 @@ define <8 x float> @expand1(<4 x float> %a ) { ; AVX512F-LABEL: expand1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] ; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -114,7 +114,7 @@ define <8 x float> @expand5(<4 x float> %a ) { ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-FAST-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [8,10,12,14] +; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0] ; AVX512-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512-FAST-NEXT: ret{{[l|q]}} ; @@ -245,7 +245,7 @@ define <16 x float> @expand12(<8 x float> %a) { ; CHECK-LABEL: expand12: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 @@ -278,7 +278,7 @@ define <8 x float> @expand14(<4 x float> %a) { ; AVX512F-LABEL: expand14: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] +; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] ; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -300,7 +300,7 @@ define <8 x float> @expand15(<4 x float> %a) { ; AVX512-FAST-LABEL: expand15: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] +; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [u,u,0,u,1,u,u,u] ; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; AVX512-FAST-NEXT: ret{{[l|q]}} @@ -467,7 +467,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){ ; ; AVX512F-LABEL: test_mm_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: ret{{[l|q]}} entry: @@ -480,13 +480,13 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X86-AVX512-LABEL: test_masked_permps_v8f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] ; X86-AVX512-NEXT: vpermt2ps (%eax), %ymm1, %ymm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: test_masked_permps_v8f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] ; X64-AVX512-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; X64-AVX512-NEXT: retq ; @@ -495,7 +495,7 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: vmovaps (%eax), %ymm1 -; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] +; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] ; X86-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X86-AVX512F-NEXT: retl @@ -504,7 +504,7 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm1 -; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] +; X64-AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] ; X64-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X64-AVX512F-NEXT: retq @@ -518,26 +518,26 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) { ; X86-AVX512-LABEL: test_masked_permps_v16f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X86-AVX512-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: test_masked_permps_v16f32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X64-AVX512-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; X64-AVX512-NEXT: retq ; ; X86-AVX512F-LABEL: test_masked_permps_v16f32: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X86-AVX512F-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: test_masked_permps_v16f32: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] +; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] ; X64-AVX512F-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq %vec = load <16 x float>, ptr %vp @@ -640,7 +640,7 @@ define <32 x float> @PR47534(<8 x float> %tmp) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,17,18,19,7,21,22,23,0,25,26,27,0,29,30,31] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,17,18,19,7,21,22,23,u,25,26,27,u,29,30,31] ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: ret{{[l|q]}} %tmp1 = shufflevector <8 x float> %tmp, <8 x 
float> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 2df013d0ff3e3..c6554ec7534cc 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -140,7 +140,7 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] @@ -322,7 +322,7 @@ define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19] ; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} @@ -377,9 +377,9 @@ define void @PR39483() { ; X86-AVX1-NEXT: vmovups 64, %ymm1 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X86-AVX1-NEXT: vmovups 16, %xmm2 -; X86-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X86-AVX1-NEXT: vmovups 16, %xmm3 +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -404,7 +404,7 @@ define void @PR39483() { ; X86-AVX512-LABEL: PR39483: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vmovups 64, %ymm0 -; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7] +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7] ; X86-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -417,9 +417,9 @@ define void @PR39483() { ; X64-AVX1-NEXT: vmovups 64, %ymm1 ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,3],ymm2[4,5],ymm1[4,7] -; X64-AVX1-NEXT: vmovups 16, %xmm2 -; X64-AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; X64-AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; X64-AVX1-NEXT: vmovups 16, %xmm3 +; X64-AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] ; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -444,7 +444,7 @@ define void @PR39483() { ; X64-AVX512-LABEL: PR39483: ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vmovups 64, %ymm0 -; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 
= [18,21,24,27,30,1,4,7] +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7] ; X64-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 @@ -518,19 +518,19 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,0,2,0,8,0,9,0] ; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,8,0,2,0,1,0] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4 ; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1] ; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3] ; X86-AVX512-NEXT: vmovapd %ymm4, (%edx) -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] ; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 ; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,11,0,0] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [3,0,11,0,u,u,u,u] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0] ; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 ; X86-AVX512-NEXT: vmovapd %ymm0, (%eax) ; X86-AVX512-NEXT: vzeroupper @@ -585,19 +585,19 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9] +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,8,9] ; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1] +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,8,2,1] ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4 ; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1] ; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3] ; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi) -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1] +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,3,10,1] ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 ; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi) -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11] +; X64-AVX512-NEXT: vmovapd {{.*#+}} xmm3 = [3,11] ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3] +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3] ; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 ; X64-AVX512-NEXT: vmovapd %ymm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper @@ -652,7 +652,7 @@ define <8 x i32> @concat_self_v8i32(<4 x i32> %x) { ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} @@ -661,7 +661,7 @@ define <8 x i32> @concat_self_v8i32(<4 x i32> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,0,2,1,3] ; AVX512-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: ret{{[l|q]}} @@ -736,15 +736,15 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: popl %ebp ; X86-AVX2-NEXT: retl ; -; AVX512-LABEL: bit_reversal_permutation: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,8,4,12,2,10,6,14] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,9,5,13,3,11,7,15] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: ret{{[l|q]}} +; X86-AVX512-LABEL: bit_reversal_permutation: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,4,0,12,0,2,0,10,0,6,0,14,0] +; X86-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,0,9,0,5,0,13,0,3,0,11,0,7,0,15,0] +; X86-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; X86-AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; X86-AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; X86-AVX512-NEXT: retl ; ; X64-AVX1-LABEL: bit_reversal_permutation: ; X64-AVX1: # %bb.0: @@ -782,6 +782,16 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vmovaps %ymm4, %ymm1 ; X64-AVX2-NEXT: vmovaps %ymm5, %ymm3 ; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: bit_reversal_permutation: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,4,12,2,10,6,14] +; X64-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,9,5,13,3,11,7,15] +; X64-AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; X64-AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 +; X64-AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 +; X64-AVX512-NEXT: retq %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> %v1 = shufflevector <16 x i64> %v0, <16 x i64> undef, <16 x i32> ret <16 x i64> %v1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 38920aa5d7a12..1583c15f06b60 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -114,17 +114,11 @@ define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) { } define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { -; AVX2-LABEL: combine_as_vpermd: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512-LABEL: combine_as_vpermd: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] -; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: ret{{[l|q]}} +; CHECK-LABEL: combine_as_vpermd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> @@ -132,17 +126,11 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { } define <8 x float> @combine_as_vpermps(<8 x float> %a0) { -; AVX2-LABEL: combine_as_vpermps: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,u,4,7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} 
-; -; AVX512-LABEL: combine_as_vpermps: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,4,7,5,1,0,4,7] -; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: ret{{[l|q]}} +; CHECK-LABEL: combine_as_vpermps: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,u,4,7] +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -857,7 +845,7 @@ define <8 x float> @demandedelts_vpermps(<8 x float> %a0, <8 x float> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,1,1,0,20,21,22,23] +; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [3,1,1,0,20,21,22,23] ; AVX512-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} @@ -867,15 +855,10 @@ define <8 x float> @demandedelts_vpermps(<8 x float> %a0, <8 x float> %a1) { } define <8 x i32> @constant_fold_permd() { -; AVX2-LABEL: constant_fold_permd: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] -; AVX2-NEXT: ret{{[l|q]}} -; -; AVX512-LABEL: constant_fold_permd: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] -; AVX512-NEXT: ret{{[l|q]}} +; CHECK-LABEL: constant_fold_permd: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 } @@ -951,7 +934,7 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [23,18,7,2,20,0,3,2] +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,18,7,2,20,u,3,2] ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} @@ -980,16 +963,16 @@ define void @PR63030(ptr %p0) { ; X86-AVX2-LABEL: PR63030: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: vmovaps (%eax), %xmm0 -; X86-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [3,0,3,0] -; X86-AVX2-NEXT: # xmm1 = mem[0,0] -; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; X86-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [3,0,3,0] +; X86-AVX2-NEXT: # xmm0 = mem[0,0] +; X86-AVX2-NEXT: vmovaps (%eax), %xmm1 +; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[1,1,0,0] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [3,0,2,0] -; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; X86-AVX2-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; X86-AVX2-NEXT: vmovaps %ymm1, (%eax) +; X86-AVX2-NEXT: vmovaps %ymm0, (%eax) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl ; @@ -997,7 +980,7 @@ define void @PR63030(ptr %p0) { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vmovdqa 
(%eax), %xmm0 -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] +; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,8,0,0,0,0,0,0,0,9,0,1,0,1,0] ; X86-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1 ; X86-AVX512-NEXT: vmovdqa64 %zmm1, (%eax) ; X86-AVX512-NEXT: vzeroupper @@ -1005,23 +988,23 @@ define void @PR63030(ptr %p0) { ; ; X64-AVX2-LABEL: PR63030: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] -; X64-AVX2-NEXT: # xmm1 = mem[0,0] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [3,3] +; X64-AVX2-NEXT: # xmm0 = mem[0,0] +; X64-AVX2-NEXT: vmovaps (%rdi), %xmm1 +; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[1,1,0,0] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [3,2] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; X64-AVX2-NEXT: vmovaps %ymm1, (%rax) +; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: PR63030: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] +; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1] ; X64-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; X64-AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; X64-AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 376fe37ef1fa8..e23e8829cb53e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -21,7 +21,7 @@ define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) { ; X86-LABEL: combine_vpermt2var_32i16_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -29,7 +29,7 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x ; ; X64-LABEL: combine_vpermt2var_32i16_identity_mask: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -54,8 +54,7 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 -; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = 
[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} @@ -65,8 +64,7 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X64-LABEL: combine_pshufb_identity_mask: ; X64: # %bb.0: ; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 -; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} @@ -144,7 +142,7 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) { define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: combine_permi2q_pshufb_as_permi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] ; CHECK-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} @@ -157,14 +155,14 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,0,5,0,0,12,0,14] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,u,u,5,0,u,u,u,u,12,0,u,u,14,0] ; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] ; X86-NEXT: retl ; ; X64-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,0,5,0,0,12,0,14] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,u,5,u,u,12,u,14] ; X64-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] @@ -289,7 +287,7 @@ define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; CHECK-LABEL: combine_vpermi2var_32i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <32 x i16> 
@llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1, i32 -1) @@ -300,7 +298,7 @@ define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40] ; CHECK-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index 6e0fa72398dda..0e256ed00ff15 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -17,7 +17,7 @@ define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16> define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16i16_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} @@ -25,7 +25,7 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x ; ; X64-LABEL: combine_vpermt2var_16i16_identity_mask: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z} ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} @@ -38,7 +38,7 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) { ; CHECK-LABEL: combine_vpermi2var_16i16_as_permw: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7] ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1) @@ -49,7 +49,7 @@ define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] ; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1) @@ -104,7 +104,7 @@ define <16 x i16> @concat2_permw_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbw 
{{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] ; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %lo = tail call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x, <8 x i16> ) @@ -124,9 +124,9 @@ define <32 x i16> @concat4_permw_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z, ; X86-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vmovdqa 8(%ebp), %xmm3 -; X86-NEXT: vpmovsxbw {{.*#+}} ymm4 = [6,1,7,0,4,3,5,2,20,19,21,18,22,17,23,16] +; X86-NEXT: vmovdqa {{.*#+}} ymm4 = [6,1,7,0,4,3,5,2,20,19,21,18,22,17,23,16] ; X86-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; X86-NEXT: vpmovsxbw {{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] +; X86-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] ; X86-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; X86-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0 ; X86-NEXT: movl %ebp, %esp @@ -139,9 +139,9 @@ define <32 x i16> @concat4_permw_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z, ; X64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; X64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vpmovsxbw {{.*#+}} ymm4 = [6,1,7,0,4,3,5,2,20,19,21,18,22,17,23,16] +; X64-NEXT: vmovdqa {{.*#+}} ymm4 = [6,1,7,0,4,3,5,2,20,19,21,18,22,17,23,16] ; X64-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 -; X64-NEXT: vpmovsxbw {{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] +; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,1,5,2,4,3,21,18,20,19,23,16,22,17] ; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; X64-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0 ; X64-NEXT: retq @@ -187,7 +187,7 @@ define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) { ; CHECK-LABEL: demandedelts_vpermvar_32i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] ; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index 68967c2ce6536..2f4826c741a44 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -33,42 +33,42 @@ define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X86-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = 
[7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X86-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_permvar_8f64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_permvar_8f64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 ; X64-AVX512BW-NEXT: retq @@ -92,42 +92,42 @@ define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X86-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X86-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-AVX512BW-NEXT: retl ; ; X64-AVX512F-LABEL: combine_permvar_8i64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; 
X64-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_permvar_8i64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} ; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-AVX512BW-NEXT: retq @@ -151,7 +151,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -160,7 +160,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -169,7 +169,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -177,7 +177,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -251,7 +251,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { ; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -260,7 +260,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> ; ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: ; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; 
X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -269,7 +269,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> ; ; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -277,7 +277,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> ; ; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -298,7 +298,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x f define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16f32_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -306,7 +306,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1 ; ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -314,7 +314,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1 ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -584,7 +584,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16i32_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -592,7 +592,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x ; ; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; 
X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -600,7 +600,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} @@ -790,11 +790,17 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x } define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { -; CHECK-LABEL: combine_vpermi2var_8f64_as_vpermpd: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: +; X86: # %bb.0: +; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 -1) ret <8 x double> %res1 @@ -830,11 +836,17 @@ define <8 x i64> @combine_vpermt2var_8i64_as_zero_valignq(<8 x i64> %x0) { } define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { -; CHECK-LABEL: combine_vpermt2var_8i64_as_vpermq: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: +; X86: # %bb.0: +; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1) %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1) ret <8 x i64> %res1 @@ -843,7 +855,7 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1 define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1) @@ -854,7 +866,7 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x define <16 x i32> 
@combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -881,12 +893,19 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) { } define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) { -; CHECK-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: -; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15] -; CHECK-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: +; X86: # %bb.0: +; X86-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0] +; X86-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; X86-NEXT: vmovapd %zmm2, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: +; X64: # %bb.0: +; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15] +; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 +; X64-NEXT: vmovapd %zmm2, %zmm0 +; X64-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> , <8 x double> %res0, <8 x double> %res0, i8 -1) ret <8 x double> %res1 @@ -916,7 +935,7 @@ define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x do define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] ; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) @@ -925,12 +944,19 @@ define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, } define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) { -; CHECK-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] -; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: +; X86: # %bb.0: +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0] +; X86-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: +; X64: # %bb.0: +; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] +; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; X64-NEXT: retq %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> ) ret <8 x double> %1 @@ -940,7 +966,7 @@ define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 
x floa ; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] ; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index 7d6ca16313583..f49469a9a7ee3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -114,7 +114,7 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1 define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: combine_permi2q_pshufb_as_permi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,11,11,11,11,24,24,24,24,29,29,29,29] ; CHECK-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index d3e4906450e43..a1de8a4cbd648 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -45,7 +45,7 @@ define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,19,0,17] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,19,0,17] ; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -83,7 +83,7 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pshufb %xmm3, %xmm4 ; SSE-NEXT: pshufb %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 2b89590a0bb41..a833c986227cd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 @@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0] ; 
AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 -; VL_BW_DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] +; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 @@ -111,7 +111,7 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) { ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 @@ -123,7 +123,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 @@ -135,7 +135,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; VL_BW_DQ: # %bb.0: -; VL_BW_DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 @@ -155,7 +155,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -169,7 +169,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -183,7 +183,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1 -; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; 
VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 @@ -207,7 +207,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -226,7 +226,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -262,7 +262,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -283,7 +283,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -320,7 +320,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -340,7 +340,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -372,7 +372,7 @@ define <32 x i16> 
@shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -387,7 +387,7 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -423,7 +423,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -438,7 +438,7 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 @@ -542,7 +542,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,2,10,0,3,0,2,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,2,10,u,3,u,2,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -557,7 +557,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 @@ -571,7 +571,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, 
%eax @@ -632,7 +632,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -646,7 +646,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -659,7 +659,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] ; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax @@ -677,7 +677,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 @@ -719,7 +719,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9] ; AVX512F-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -764,7 +764,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 @@ -780,7 +780,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -793,7 +793,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 ; VL_BW_DQ-NEXT: 
vpmovd2m %ymm2, %k0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index 497f71aea2227..a6517a7a0b9a5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -16,10 +16,9 @@ define <64 x i8> @f1(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] @@ -107,8 +106,7 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,1,3,7,9,13,15,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] @@ -160,10 +158,9 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,1,5,7,11,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] @@ -173,7 +170,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 @@ -193,14 +190,13 @@ define <64 x i8> @f2(ptr %p0) { ; ; AVX512F-LABEL: f2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = 
[u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] +; AVX512F-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] @@ -208,26 +204,27 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) ; AVX512F-NEXT: retq ; @@ -305,12 +302,11 @@ define <64 x i8> @f3(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb 
%ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm4 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -340,10 +336,9 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm5 & (ymm0 ^ ymm2)) ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] @@ -391,8 +386,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,2,4,8,10,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] @@ -443,10 +437,9 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,0,4,6,10,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm6 = [18446744073709551615,16777215] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] @@ -456,7 +449,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 @@ -476,14 +469,13 @@ define <64 x i8> @f4(ptr 
%p0) { ; ; AVX512F-LABEL: f4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] +; AVX512F-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] @@ -491,26 +483,27 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & mem) +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll index 0efbe018764d2..5d41d68445d74 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll 
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -15,9 +15,9 @@ define <32 x i8> @foo(ptr %x0) { ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,2,3,5,6] ; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[1,2,4,5,7,8,10,11,13,14] +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: retq ; @@ -34,13 +34,12 @@ define <32 x i8> @foo(ptr %x0) { ; AVX2-LABEL: foo: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX2-NEXT: vmovdqu (%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] +; AVX2-NEXT: vmovdqu (%rdi), %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -64,17 +63,17 @@ define <32 x i8> @foo(ptr %x0) { ; AVX512BW-LABEL: foo: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] +; AVX512BW-NEXT: vmovdqu (%rdi), %ymm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $63488, %eax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: foo: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index ce8d2acd035f6..a0356810b2e10 100644 --- 
a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -116,10 +116,10 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; @@ -133,10 +133,10 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3 ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 @@ -201,10 +201,10 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 +; SSE41-NEXT: andl $3, %ecx ; SSE41-NEXT: pinsrd $3, -24(%rsp,%rcx,4), %xmm0 ; SSE41-NEXT: retq ; @@ -218,10 +218,10 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 %i0 @@ -333,7 +333,6 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE41-NEXT: # kill: def $esi killed $esi def $rsi ; SSE41-NEXT: # kill: def $edi killed $edi def $rdi ; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; SSE41-NEXT: andl $7, %eax ; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; SSE41-NEXT: andl $7, %r10d ; SSE41-NEXT: andl $7, %edi @@ -351,6 +350,7 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; SSE41-NEXT: pinsrw $4, -24(%rsp,%r8,2), %xmm0 ; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 ; SSE41-NEXT: pinsrw $6, -24(%rsp,%r10,2), %xmm0 +; SSE41-NEXT: andl $7, %eax ; SSE41-NEXT: pinsrw $7, -24(%rsp,%rax,2), %xmm0 ; SSE41-NEXT: retq ; @@ -363,7 +363,6 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $7, %eax ; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; AVX-NEXT: andl $7, %r10d ; AVX-NEXT: andl $7, %edi @@ -381,6 +380,7 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1 ; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $6, 
-24(%rsp,%r10,2), %xmm0, %xmm0 +; AVX-NEXT: andl $7, %eax ; AVX-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 @@ -574,9 +574,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; SSE41: # %bb.0: ; SSE41-NEXT: # kill: def $r9d killed $r9d def $r9 -; SSE41-NEXT: # kill: def $r8d killed $r8d def $r8 -; SSE41-NEXT: # kill: def $ecx killed $ecx def $rcx -; SSE41-NEXT: # kill: def $edx killed $edx def $rdx ; SSE41-NEXT: # kill: def $esi killed $esi def $rsi ; SSE41-NEXT: # kill: def $edi killed $edi def $rdi ; SSE41-NEXT: andl $15, %edi @@ -585,10 +582,13 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: andl $15, %esi ; SSE41-NEXT: pinsrb $1, -24(%rsp,%rsi), %xmm0 +; SSE41-NEXT: # kill: def $edx killed $edx def $rdx ; SSE41-NEXT: andl $15, %edx ; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0 +; SSE41-NEXT: # kill: def $ecx killed $ecx def $rcx ; SSE41-NEXT: andl $15, %ecx ; SSE41-NEXT: pinsrb $3, -24(%rsp,%rcx), %xmm0 +; SSE41-NEXT: # kill: def $r8d killed $r8d def $r8 ; SSE41-NEXT: andl $15, %r8d ; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0 ; SSE41-NEXT: andl $15, %r9d @@ -628,9 +628,6 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $15, %edi @@ -639,10 +636,13 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: andl $15, %esi ; AVX-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX-NEXT: # kill: def $edx killed $edx def $rdx ; AVX-NEXT: andl $15, %edx ; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 +; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX-NEXT: andl $15, %ecx ; AVX-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0 +; AVX-NEXT: # kill: def $r8d killed $r8d def $r8 ; AVX-NEXT: andl $15, %r8d ; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r9d @@ -767,11 +767,11 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, ptr %i) nounwin ; SSE41-NEXT: movl 8(%rdi), %edx ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movl 12(%rdi), %esi -; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0 ; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0 +; SSE41-NEXT: andl $3, %esi ; SSE41-NEXT: pinsrd $3, -24(%rsp,%rsi,4), %xmm0 ; SSE41-NEXT: retq ; @@ -784,11 +784,11 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, ptr %i) nounwin ; AVX-NEXT: movl 8(%rdi), %edx ; AVX-NEXT: andl $3, %edx ; AVX-NEXT: movl 12(%rdi), %esi -; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX-NEXT: andl $3, %esi ; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0 ; AVX-NEXT: retq %p1 = getelementptr inbounds i32, ptr %i, i64 
1 @@ -1379,12 +1379,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; SSE41-NEXT: andl $7, %ecx ; SSE41-NEXT: andl $7, %r8d ; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: andl $7, %r9d ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm0 ; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm0 ; SSE41-NEXT: pinsrw $3, -24(%rsp,%rcx,2), %xmm0 ; SSE41-NEXT: pinsrw $4, -40(%rsp,%r8,2), %xmm0 +; SSE41-NEXT: andl $7, %r9d ; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 ; SSE41-NEXT: retq ; @@ -1404,12 +1404,12 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> % ; AVX-NEXT: andl $7, %ecx ; AVX-NEXT: andl $7, %r8d ; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 ; AVX-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll index 8f78438dedf92..60bed708b3230 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -196,13 +196,13 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0 ; ALL-NEXT: andl $7, %edi ; ALL-NEXT: andl $7, %esi ; ALL-NEXT: andl $7, %edx -; ALL-NEXT: andl $7, %ecx ; ALL-NEXT: andl $7, %r8d ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: andl $7, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; ALL-NEXT: andl $7, %ecx ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] @@ -247,13 +247,13 @@ define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0 ; ALL-NEXT: andl $3, %edi ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: andl $3, %r8d ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: andl $3, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] @@ -289,8 +289,6 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX1-NEXT: subq $64, %rsp ; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: movl 32(%rbp), %eax @@ -324,8 +322,10 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: andl $15, %esi ; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: # kill: def $edx killed 
$edx def $rdx ; AVX1-NEXT: andl $15, %edx ; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX1-NEXT: andl $15, %ecx ; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 ; AVX1-NEXT: andl $15, %r8d @@ -351,8 +351,6 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX2-NEXT: subq $64, %rsp ; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: movl 32(%rbp), %eax @@ -386,8 +384,10 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: andl $15, %esi ; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: andl $15, %edx ; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX2-NEXT: andl $15, %ecx ; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 ; AVX2-NEXT: andl $15, %r8d @@ -444,8 +444,6 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -479,8 +477,10 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: andl $7, %esi ; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: andl $7, %edx ; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX1-NEXT: andl $7, %ecx ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 ; AVX1-NEXT: andl $7, %r8d @@ -500,8 +500,6 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 ; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax @@ -535,8 +533,10 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: andl $7, %esi ; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: andl $7, %edx ; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx ; AVX2-NEXT: andl $7, %ecx ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 ; AVX2-NEXT: andl $7, %r8d diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 1af7542436501..0bc9d0f8ae0e6 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -43,7 +43,7 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: 
trunc_add_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -224,7 +224,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -320,8 +321,8 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-LABEL: trunc_add_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -428,8 +429,8 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { ; ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -479,7 +480,7 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -520,7 +521,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -620,7 +621,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -688,7 +689,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, 
%xmm1 @@ -813,7 +814,7 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -994,7 +995,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1090,8 +1092,8 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-LABEL: trunc_sub_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -1219,7 +1221,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1260,7 +1262,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1360,7 +1362,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -1428,7 +1430,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1589,7 +1591,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; 
AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1662,7 +1664,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -1831,7 +1833,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1971,8 +1974,8 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-LABEL: trunc_mul_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -2079,8 +2082,8 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { ; ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -2133,7 +2136,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2174,7 +2177,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2292,7 +2295,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: # xmm8 = mem[0,0] ; 
AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 @@ -2372,26 +2376,26 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] @@ -2539,8 +2543,7 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2586,7 +2589,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2699,7 +2702,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2775,7 +2778,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss 
{{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2789,8 +2792,8 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-LABEL: trunc_and_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -2897,8 +2900,7 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [u,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2939,7 +2941,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3039,7 +3041,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3107,7 +3109,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3230,8 +3232,7 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -3277,7 +3278,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ 
-3390,7 +3391,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3466,7 +3467,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3480,8 +3481,8 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX2-LABEL: trunc_xor_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -3588,8 +3589,7 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -3630,7 +3630,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3730,7 +3730,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3798,7 +3798,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3921,8 +3921,7 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = 
mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -3968,7 +3967,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4081,7 +4080,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4157,7 +4156,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4171,8 +4170,8 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX2-LABEL: trunc_or_v16i32_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 @@ -4279,8 +4278,7 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -4321,7 +4319,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4421,7 +4419,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4489,7 +4487,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 
; ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index da8a3f3fa0d4e..bf49edc1da397 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -54,9 +54,9 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -77,16 +77,28 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -172,9 +184,9 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -196,17 +208,30 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; 
AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -316,10 +341,10 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -363,7 +388,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -399,7 +425,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -578,75 +604,73 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; ; SSE41-LABEL: trunc_packus_v8i64_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm5 -; SSE41-NEXT: movdqa 16(%rdi), %xmm8 -; SSE41-NEXT: movdqa 32(%rdi), %xmm7 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: movdqa (%rdi), %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; 
SSE41-NEXT: pmovsxdq {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm2 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: xorpd %xmm3, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 
+; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] -; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm6 ; SSE41-NEXT: xorpd %xmm3, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 @@ -655,16 +679,18 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: xorpd %xmm3, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: xorpd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -672,47 +698,48 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="25 ; AVX1-LABEL: trunc_packus_v8i64_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -782,9 +809,9 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -808,7 +835,8 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -820,7 +848,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -832,7 +860,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -925,9 +953,9 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147549183,2147549183] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -952,7 +980,8 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -965,7 +994,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -978,7 +1007,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1098,19 +1127,19 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-LABEL: trunc_packus_v4i64_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -1119,27 +1148,27 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; 
SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: packusdw %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1147,7 +1176,8 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -1288,19 +1318,19 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_packus_v4i64_v4i16_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -1309,35 +1339,36 @@ define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, 
%xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 -; SSE41-NEXT: movq %xmm1, (%rdi) +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: movq %xmm2, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -1534,74 +1565,72 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; ; SSE41-LABEL: trunc_packus_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm5 -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm8, %xmm1 -; SSE41-NEXT: xorpd %xmm2, %xmm1 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: xorpd %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm7, %xmm1 +; SSE41-NEXT: xorpd %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm6, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm6 ; SSE41-NEXT: xorpd %xmm2, %xmm6 ; SSE41-NEXT: movapd %xmm6, %xmm7 @@ -1612,46 +1641,49 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="25 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; 
SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packusdw %xmm6, %xmm5 +; SSE41-NEXT: xorpd %xmm4, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; SSE41-NEXT: packusdw %xmm5, %xmm1 +; SSE41-NEXT: packusdw %xmm6, %xmm4 +; SSE41-NEXT: packusdw %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i64_v8i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [65535,65535] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = [65535,65535] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2190,9 +2222,9 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -2214,16 +2246,28 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, 
%xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2344,9 +2388,9 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483903,2147483903] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -2368,17 +2412,30 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2491,19 +2548,19 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-LABEL: trunc_packus_v4i64_v4i8: ; SSE41: 
# %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -2512,27 +2569,27 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: packusdw %xmm1, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm1 @@ -2541,7 +2598,8 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -2685,19 +2743,19 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-LABEL: trunc_packus_v4i64_v4i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} 
xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 @@ -2706,36 +2764,37 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: xorpd %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: movapd %xmm4, %xmm5 +; SSE41-NEXT: xorpd %xmm3, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, (%rdi) +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm2 +; SSE41-NEXT: movd %xmm2, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -2940,10 +2999,10 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} 
xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 @@ -3002,27 +3061,27 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: movapd %xmm4, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 ; SSE41-NEXT: packusdw %xmm6, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 ; SSE41-NEXT: movapd %xmm3, %xmm4 ; SSE41-NEXT: xorpd %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: packusdw %xmm6, %xmm5 +; SSE41-NEXT: packusdw %xmm7, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -3033,25 +3092,26 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -3060,19 +3120,19 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [255,255,255,255] +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -3227,10 +3287,10 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: movdqa 32(%rdi), %xmm3 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483903,2147483903] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 @@ -3289,27 +3349,27 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: xorpd %xmm1, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 ; SSE41-NEXT: packusdw %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: xorpd %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: movapd %xmm2, %xmm3 ; SSE41-NEXT: xorpd %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 +; 
SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm6 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: packusdw %xmm6, %xmm5 +; SSE41-NEXT: packusdw %xmm7, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: packuswb %xmm4, %xmm4 ; SSE41-NEXT: movq %xmm4, (%rsi) @@ -3320,25 +3380,26 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -3348,19 +3409,19 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [255,255,255,255] +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; 
AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -3615,140 +3676,140 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; ; SSE41-LABEL: trunc_packus_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm12 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movdqa 80(%rdi), %xmm10 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm11 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 +; SSE41-NEXT: movdqa 80(%rdi), %xmm9 ; SSE41-NEXT: movdqa 64(%rdi), %xmm6 ; SSE41-NEXT: movdqa 112(%rdi), %xmm5 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pxor %xmm2, %xmm10 
-; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm2, %xmm12 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm11, %xmm9 +; SSE41-NEXT: pxor %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9 +; SSE41-NEXT: movdqa %xmm10, %xmm11 ; SSE41-NEXT: pxor %xmm2, %xmm11 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pxor %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm7, %xmm7 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: xorpd %xmm2, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: pand %xmm12, %xmm0 ; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: xorpd %xmm2, %xmm7 +; SSE41-NEXT: movapd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm8, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: movapd %xmm10, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm9 -; 
SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm1 +; SSE41-NEXT: pxor %xmm7, %xmm7 ; SSE41-NEXT: packusdw %xmm8, %xmm1 -; SSE41-NEXT: movapd %xmm12, %xmm8 +; SSE41-NEXT: movapd %xmm11, %xmm8 ; SSE41-NEXT: xorpd %xmm2, %xmm8 -; SSE41-NEXT: movapd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm8, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm10 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: pxor %xmm8, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 -; SSE41-NEXT: movapd %xmm10, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm8 +; SSE41-NEXT: movapd %xmm9, %xmm10 +; SSE41-NEXT: xorpd %xmm2, %xmm10 +; SSE41-NEXT: movapd %xmm10, %xmm11 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm2, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm9 -; SSE41-NEXT: packusdw %xmm8, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm1 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm10, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packusdw %xmm8, %xmm10 +; SSE41-NEXT: packusdw %xmm10, %xmm1 ; SSE41-NEXT: movapd %xmm6, %xmm8 ; SSE41-NEXT: xorpd %xmm2, %xmm8 ; SSE41-NEXT: movapd %xmm8, %xmm9 @@ -3769,17 +3830,17 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 -; SSE41-NEXT: packusdw %xmm8, %xmm6 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: xorpd %xmm2, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm5, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 +; SSE41-NEXT: packusdw %xmm8, %xmm6 ; SSE41-NEXT: movapd %xmm3, %xmm4 ; SSE41-NEXT: xorpd %xmm2, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm8 @@ -3798,81 +3859,82 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; AVX1-LABEL: trunc_packus_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 -; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm3, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm3, %xmm9 +; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm9 ; AVX1-NEXT: vpand %xmm5, %xmm9, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 +; AVX1-NEXT: vpackusdw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm7 +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm4, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm1, %xmm4 ; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v16i64_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rdi), 
%ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm5 ; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm5 ; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -3887,8 +3949,8 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2 ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 -; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 ; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 ; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vzeroupper @@ -4489,9 +4551,9 @@ define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="2 ; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: packssdw 48(%rdi), %xmm2 ; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packssdw 112(%rdi), %xmm3 ; SSE-NEXT: packssdw 80(%rdi), %xmm1 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm1 ; SSE-NEXT: retq ; @@ -4503,22 +4565,22 @@ define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="2 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX1-NEXT: vpackssdw 112(%rdi), %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v32i32_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-NEXT: vpackssdw 96(%rdi), %ymm0, 
%ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -4526,8 +4588,8 @@ define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="2 ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm1 -; AVX512-NEXT: vpmovusdb %zmm1, %xmm1 ; AVX512-NEXT: vpmaxsd 64(%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vpmovusdb %zmm1, %xmm1 ; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index d0cdbf1e3f08d..6a5b0218e82ae 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -40,8 +40,8 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -56,9 +56,9 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -68,7 +68,7 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 @@ -94,10 +94,10 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; ; AVX2-LABEL: trunc_ssat_v2i64_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -162,8 +162,8 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: 
pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -179,9 +179,9 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [2147483647,2147483647] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [4294967295,0,4294967295,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -191,7 +191,7 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4294967295,0,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 @@ -219,10 +219,10 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; ; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -333,10 +333,10 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -357,7 +357,7 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE41-NEXT: movapd %xmm4, %xmm2 ; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] ; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 @@ -418,8 +418,7 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 
+; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -599,64 +598,64 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-LABEL: trunc_ssat_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm5 -; SSE41-NEXT: movdqa 16(%rdi), %xmm8 -; SSE41-NEXT: movdqa 32(%rdi), %xmm7 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [2147483647,2147483647] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm5, %xmm4 ; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [4294967295,0,4294967295,0] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm7, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm7 +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm3, %xmm7 -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm6 = [0,4294967295,0,4294967295] -; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 +; 
SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: xorpd %xmm3, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE41-NEXT: movapd %xmm8, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm1 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 @@ -665,15 +664,14 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm5, %xmm7 ; SSE41-NEXT: xorpd %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 @@ -685,39 +683,40 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: 
vblendvpd %xmm5, %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] ; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm1[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i64_v8i32: @@ -786,8 +785,8 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -803,9 +802,9 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [32767,32767] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -829,10 +828,12 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -841,10 +842,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -853,10 +854,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -919,8 +920,8 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -937,9 +938,9 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [32767,32767] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147516415,2147516415] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -964,10 +965,12 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -977,10 +980,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -990,10 +993,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1105,10 +1108,10 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -1153,13 +1156,15 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -1289,10 +1294,10 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [32767,32767] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -1337,13 +1342,15 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -1530,65 +1537,65 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: ; SSE41: # 
%bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm7 -; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa (%rdi), %xmm6 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [32767,32767] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848] +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm5 = [18446744073709518848,18446744073709518848] +; SSE41-NEXT: movapd %xmm1, %xmm8 +; SSE41-NEXT: xorpd %xmm2, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: movapd 
%xmm8, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: movapd %xmm8, %xmm1 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movapd %xmm7, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 @@ -1597,15 +1604,14 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packssdw %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm7 ; SSE41-NEXT: xorpd %xmm2, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 ; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 @@ -1617,6 +1623,7 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: packssdw %xmm8, %xmm1 ; SSE41-NEXT: packssdw %xmm7, %xmm5 ; SSE41-NEXT: packssdw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -1624,31 +1631,33 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [32767,32767] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [32767,32767] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: 
vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i64_v8i16: @@ -1937,8 +1946,8 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSSE3-NEXT: pand %xmm3, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -1953,9 +1962,9 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -1977,16 +1986,29 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2099,9 +2121,9 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = 
[127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -2123,17 +2145,31 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2241,10 +2277,10 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -2290,13 +2326,15 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, 
%xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -2429,10 +2467,10 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 @@ -2478,13 +2516,15 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; ; AVX1-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 @@ -2679,10 +2719,10 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm3, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm3 @@ -2769,31 +2809,33 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: 
vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2972,10 +3014,10 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; SSE41-NEXT: movdqa 32(%rdi), %xmm3 ; SSE41-NEXT: movdqa 48(%rdi), %xmm8 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 @@ -3062,31 +3104,33 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, 
%xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rsi) ; AVX1-NEXT: retq @@ -3368,109 +3412,109 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; ; SSE41-LABEL: trunc_ssat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm12 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movdqa 80(%rdi), %xmm10 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm11 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 +; SSE41-NEXT: movdqa 80(%rdi), %xmm9 ; SSE41-NEXT: movdqa 64(%rdi), %xmm6 ; SSE41-NEXT: movdqa 112(%rdi), %xmm5 ; SSE41-NEXT: movdqa 96(%rdi), %xmm4 ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [127,127] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 ; SSE41-NEXT: movdqa %xmm5, %xmm4 ; SSE41-NEXT: pxor %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm6, %xmm5 ; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm5 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; SSE41-NEXT: movdqa %xmm10, %xmm6 +; SSE41-NEXT: movdqa %xmm9, %xmm6 ; SSE41-NEXT: pxor %xmm2, %xmm6 
-; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pxor %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm10 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm2, %xmm12 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movdqa %xmm11, %xmm9 +; SSE41-NEXT: pxor %xmm2, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9 +; SSE41-NEXT: movdqa %xmm10, %xmm11 ; SSE41-NEXT: pxor %xmm2, %xmm11 -; SSE41-NEXT: movdqa %xmm9, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 +; SSE41-NEXT: movdqa %xmm7, %xmm10 +; SSE41-NEXT: pxor %xmm2, %xmm10 +; SSE41-NEXT: movdqa %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: movdqa 16(%rdi), %xmm7 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm1, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 +; SSE41-NEXT: movapd %xmm1, %xmm12 +; SSE41-NEXT: xorpd %xmm2, %xmm12 ; 
SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm9, %xmm13 +; SSE41-NEXT: movapd %xmm12, %xmm13 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movapd %xmm11, %xmm1 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movapd %xmm10, %xmm1 ; SSE41-NEXT: xorpd %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm13 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm13 @@ -3479,50 +3523,50 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; SSE41-NEXT: pand %xmm13, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 -; SSE41-NEXT: packssdw %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm12, %xmm9 -; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm9 -; SSE41-NEXT: movapd %xmm10, %xmm11 -; SSE41-NEXT: xorpd %xmm2, %xmm11 -; SSE41-NEXT: movapd %xmm11, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm1 +; SSE41-NEXT: packssdw %xmm12, %xmm1 +; SSE41-NEXT: movapd %xmm11, %xmm10 +; SSE41-NEXT: xorpd %xmm2, %xmm10 +; SSE41-NEXT: movapd %xmm10, %xmm12 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm12 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm11, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packssdw %xmm9, %xmm11 -; SSE41-NEXT: packssdw %xmm11, %xmm1 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movapd %xmm9, %xmm10 +; SSE41-NEXT: xorpd %xmm2, %xmm10 +; SSE41-NEXT: movapd %xmm10, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm0 +; SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packssdw %xmm12, %xmm10 ; SSE41-NEXT: movapd %xmm6, %xmm9 ; SSE41-NEXT: xorpd %xmm2, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: movapd %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 ; SSE41-NEXT: movapd %xmm5, %xmm6 ; SSE41-NEXT: xorpd %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE41-NEXT: movapd %xmm6, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm11 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 +; 
SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: movapd %xmm7, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 +; SSE41-NEXT: packssdw %xmm10, %xmm1 ; SSE41-NEXT: packssdw %xmm9, %xmm6 ; SSE41-NEXT: movapd %xmm4, %xmm5 ; SSE41-NEXT: xorpd %xmm2, %xmm5 @@ -3551,82 +3595,84 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-LABEL: trunc_ssat_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 -; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpackssdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm8, %xmm6, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8 -; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpackssdw %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 +; AVX1-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm3, %xmm7 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm3, %xmm9 +; AVX1-NEXT: vblendvpd 
%xmm9, %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm3, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm9 +; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm4, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm6 +; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm1, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v16i64_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vblendvpd %ymm5, %ymm4, %ymm2, %ymm4 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpackssdw %ymm4, %ymm1, 
%ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 @@ -4213,9 +4259,9 @@ define <32 x i8> @trunc_ssat_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256 ; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: packssdw 48(%rdi), %xmm2 ; SSE-NEXT: packssdw 16(%rdi), %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 ; SSE-NEXT: packssdw 112(%rdi), %xmm3 ; SSE-NEXT: packssdw 80(%rdi), %xmm1 +; SSE-NEXT: packsswb %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm3, %xmm1 ; SSE-NEXT: retq ; @@ -4227,22 +4273,22 @@ define <32 x i8> @trunc_ssat_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256 ; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX1-NEXT: vpackssdw 112(%rdi), %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v32i32_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-NEXT: vpackssdw 96(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -4412,76 +4458,76 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; ; SSE41-LABEL: trunc_ssat_v16i32_v16i24: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,127,65535,127,65535,127,65535,127] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8388607,8388607,8388607,8388607] ; SSE41-NEXT: pminsd %xmm4, %xmm3 ; SSE41-NEXT: pminsd %xmm4, %xmm2 ; SSE41-NEXT: pminsd %xmm4, %xmm1 ; SSE41-NEXT: pminsd %xmm4, %xmm0 -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [0,65408,0,65408,0,65408,0,65408] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4286578688,4286578688,4286578688,4286578688] ; SSE41-NEXT: pmaxsd %xmm4, %xmm0 ; SSE41-NEXT: pmaxsd %xmm4, %xmm1 ; SSE41-NEXT: pmaxsd %xmm4, %xmm2 ; SSE41-NEXT: pmaxsd %xmm4, %xmm3 -; SSE41-NEXT: pextrd $3, %xmm3, %eax -; SSE41-NEXT: movw %ax, 45(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 47(%rdi) +; SSE41-NEXT: pextrd $3, %xmm3, %ecx ; SSE41-NEXT: pextrd $2, %xmm3, %eax +; SSE41-NEXT: movw %cx, 45(%rdi) ; SSE41-NEXT: movw %ax, 42(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 47(%rdi) +; SSE41-NEXT: pextrd $1, %xmm3, %ecx +; SSE41-NEXT: movw %cx, 39(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 44(%rdi) -; SSE41-NEXT: pextrd $1, %xmm3, %eax -; SSE41-NEXT: movw %ax, 39(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 41(%rdi) ; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: movw %ax, 36(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 41(%rdi) +; SSE41-NEXT: pextrd $3, %xmm2, %ecx +; SSE41-NEXT: movw %cx, 33(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 38(%rdi) -; SSE41-NEXT: pextrd $3, %xmm2, %eax -; SSE41-NEXT: movw %ax, 33(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 35(%rdi) ; SSE41-NEXT: 
pextrd $2, %xmm2, %eax +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 35(%rdi) +; SSE41-NEXT: pextrd $1, %xmm2, %ecx ; SSE41-NEXT: movw %ax, 30(%rdi) +; SSE41-NEXT: movw %cx, 27(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 32(%rdi) -; SSE41-NEXT: pextrd $1, %xmm2, %eax -; SSE41-NEXT: movw %ax, 27(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 29(%rdi) ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: movw %ax, 24(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 29(%rdi) +; SSE41-NEXT: pextrd $3, %xmm1, %ecx +; SSE41-NEXT: movw %cx, 21(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 26(%rdi) -; SSE41-NEXT: pextrd $3, %xmm1, %eax -; SSE41-NEXT: movw %ax, 21(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 23(%rdi) ; SSE41-NEXT: pextrd $2, %xmm1, %eax ; SSE41-NEXT: movw %ax, 18(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 23(%rdi) +; SSE41-NEXT: pextrd $1, %xmm1, %ecx +; SSE41-NEXT: movw %cx, 15(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 20(%rdi) -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: movw %ax, 15(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 17(%rdi) ; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 17(%rdi) +; SSE41-NEXT: pextrd $3, %xmm0, %ecx ; SSE41-NEXT: movw %ax, 12(%rdi) +; SSE41-NEXT: movw %cx, 9(%rdi) ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 14(%rdi) -; SSE41-NEXT: pextrd $3, %xmm0, %eax -; SSE41-NEXT: movw %ax, 9(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 11(%rdi) ; SSE41-NEXT: pextrd $2, %xmm0, %eax ; SSE41-NEXT: movw %ax, 6(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 11(%rdi) +; SSE41-NEXT: pextrd $1, %xmm0, %ecx ; SSE41-NEXT: shrl $16, %eax ; SSE41-NEXT: movb %al, 8(%rdi) -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: movw %ax, 3(%rdi) -; SSE41-NEXT: shrl $16, %eax -; SSE41-NEXT: movb %al, 5(%rdi) +; SSE41-NEXT: movw %cx, 3(%rdi) +; SSE41-NEXT: shrl $16, %ecx +; SSE41-NEXT: movb %cl, 5(%rdi) ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: movw %ax, (%rdi) ; SSE41-NEXT: shrl $16, %eax @@ -4502,66 +4548,66 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpextrd $3, %xmm3, %eax -; AVX1-NEXT: movw %ax, 45(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 47(%rdi) +; AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; AVX1-NEXT: vpextrd $2, %xmm3, %eax +; AVX1-NEXT: movw %cx, 45(%rdi) ; AVX1-NEXT: movw %ax, 42(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 47(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm3, %ecx +; AVX1-NEXT: movw %cx, 39(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 44(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm3, %eax -; AVX1-NEXT: movw %ax, 39(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 41(%rdi) ; AVX1-NEXT: vmovd %xmm3, %eax ; AVX1-NEXT: movw %ax, 36(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 41(%rdi) +; AVX1-NEXT: vpextrd $3, %xmm2, %ecx +; AVX1-NEXT: movw %cx, 33(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 38(%rdi) -; AVX1-NEXT: vpextrd $3, %xmm2, %eax -; AVX1-NEXT: movw %ax, 33(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 35(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm2, %eax +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 35(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm2, %ecx ; AVX1-NEXT: movw %ax, 30(%rdi) +; AVX1-NEXT: movw %cx, 27(%rdi) ; 
AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 32(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm2, %eax -; AVX1-NEXT: movw %ax, 27(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 29(%rdi) ; AVX1-NEXT: vmovd %xmm2, %eax ; AVX1-NEXT: movw %ax, 24(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 29(%rdi) +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 21(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 26(%rdi) -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movw %ax, 21(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 23(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm1, %eax ; AVX1-NEXT: movw %ax, 18(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 23(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm1, %ecx +; AVX1-NEXT: movw %cx, 15(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 20(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: movw %ax, 15(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 17(%rdi) ; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 17(%rdi) +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx ; AVX1-NEXT: movw %ax, 12(%rdi) +; AVX1-NEXT: movw %cx, 9(%rdi) ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 14(%rdi) -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: movw %ax, 9(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 11(%rdi) ; AVX1-NEXT: vpextrd $2, %xmm0, %eax ; AVX1-NEXT: movw %ax, 6(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 11(%rdi) +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx ; AVX1-NEXT: shrl $16, %eax ; AVX1-NEXT: movb %al, 8(%rdi) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: movw %ax, 3(%rdi) -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: movb %al, 5(%rdi) +; AVX1-NEXT: movw %cx, 3(%rdi) +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: movb %cl, 5(%rdi) ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: movw %ax, (%rdi) ; AVX1-NEXT: shrl $16, %eax @@ -4574,71 +4620,71 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607] ; AVX2-NEXT: vpminsd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4286578688,4286578688,4286578688,4286578688,4286578688,4286578688,4286578688,4286578688] -; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [4286578688,4286578688,4286578688,4286578688,4286578688,4286578688,4286578688,4286578688] +; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpextrd $3, %xmm2, %eax -; AVX2-NEXT: movw %ax, 45(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 47(%rdi) +; AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; AVX2-NEXT: vpmaxsd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: movw %cx, 45(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm2, %eax +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 47(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm2, %ecx ; AVX2-NEXT: movw %ax, 42(%rdi) +; AVX2-NEXT: movw %cx, 39(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 44(%rdi) -; AVX2-NEXT: vpextrd $1, %xmm2, %eax -; AVX2-NEXT: movw %ax, 39(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 41(%rdi) ; AVX2-NEXT: vmovd %xmm2, %eax ; AVX2-NEXT: movw %ax, 36(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 41(%rdi) +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 33(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 38(%rdi) -; AVX2-NEXT: vpextrd $3, %xmm1, 
%eax -; AVX2-NEXT: movw %ax, 33(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 35(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %eax ; AVX2-NEXT: movw %ax, 30(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 35(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 27(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 32(%rdi) -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: movw %ax, 27(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 29(%rdi) ; AVX2-NEXT: vmovd %xmm1, %eax ; AVX2-NEXT: movw %ax, 24(%rdi) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 29(%rdi) +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 21(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 26(%rdi) -; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movw %ax, 21(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 23(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm1, %eax ; AVX2-NEXT: movw %ax, 18(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 23(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; AVX2-NEXT: movw %cx, 15(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 20(%rdi) -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: movw %ax, 15(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 17(%rdi) ; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 17(%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx ; AVX2-NEXT: movw %ax, 12(%rdi) +; AVX2-NEXT: movw %cx, 9(%rdi) ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 14(%rdi) -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: movw %ax, 9(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 11(%rdi) ; AVX2-NEXT: vpextrd $2, %xmm0, %eax ; AVX2-NEXT: movw %ax, 6(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 11(%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx ; AVX2-NEXT: shrl $16, %eax ; AVX2-NEXT: movb %al, 8(%rdi) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: movw %ax, 3(%rdi) -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movb %al, 5(%rdi) +; AVX2-NEXT: movw %cx, 3(%rdi) +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: movb %cl, 5(%rdi) ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: movw %ax, (%rdi) ; AVX2-NEXT: shrl $16, %eax @@ -4675,11 +4721,11 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX512-NEXT: vpextrd $3, %xmm0, %esi ; AVX512-NEXT: movw %si, 9(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm0, %edx +; AVX512-NEXT: vpextrd $1, %xmm0, %eax ; AVX512-NEXT: movw %dx, 6(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm0, %ecx -; AVX512-NEXT: movw %cx, 3(%rdi) -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, (%rdi) +; AVX512-NEXT: movw %ax, 3(%rdi) +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: movw %cx, (%rdi) ; AVX512-NEXT: shrl $16, %r15d ; AVX512-NEXT: movb %r15b, 47(%rdi) ; AVX512-NEXT: shrl $16, %r14d @@ -4700,27 +4746,27 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; AVX512-NEXT: vpextrd $3, %xmm0, %r11d ; AVX512-NEXT: movw %r11w, 21(%rdi) ; AVX512-NEXT: vpextrd $2, %xmm0, %r10d +; AVX512-NEXT: vpextrd $1, %xmm0, %r8d ; AVX512-NEXT: movw %r10w, 18(%rdi) -; AVX512-NEXT: vpextrd $1, %xmm0, %r9d -; AVX512-NEXT: movw %r9w, 15(%rdi) -; AVX512-NEXT: vmovd %xmm0, %r8d -; AVX512-NEXT: movw %r8w, 12(%rdi) +; AVX512-NEXT: movw %r8w, 15(%rdi) +; AVX512-NEXT: vmovd %xmm0, %r9d +; AVX512-NEXT: movw %r9w, 12(%rdi) ; AVX512-NEXT: shrl $16, %esi ; AVX512-NEXT: movb %sil, 11(%rdi) ; AVX512-NEXT: shrl $16, %edx ; AVX512-NEXT: movb %dl, 8(%rdi) -; AVX512-NEXT: 
shrl $16, %ecx -; AVX512-NEXT: movb %cl, 5(%rdi) ; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: movb %al, 2(%rdi) +; AVX512-NEXT: movb %al, 5(%rdi) +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movb %cl, 2(%rdi) ; AVX512-NEXT: shrl $16, %r11d ; AVX512-NEXT: movb %r11b, 23(%rdi) ; AVX512-NEXT: shrl $16, %r10d ; AVX512-NEXT: movb %r10b, 20(%rdi) -; AVX512-NEXT: shrl $16, %r9d -; AVX512-NEXT: movb %r9b, 17(%rdi) ; AVX512-NEXT: shrl $16, %r8d -; AVX512-NEXT: movb %r8b, 14(%rdi) +; AVX512-NEXT: movb %r8b, 17(%rdi) +; AVX512-NEXT: shrl $16, %r9d +; AVX512-NEXT: movb %r9b, 14(%rdi) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a5d83a86f295e..45adace8f8252 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -183,18 +183,18 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 @@ -269,8 +269,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -292,7 +291,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vpmovqd %zmm1, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -302,7 +301,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512VL-LABEL: trunc_usat_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm1 = 
[4294967295,4294967295,4294967295,429496729] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] ; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -312,7 +311,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -322,7 +321,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 -; AVX512BWVL-NEXT: vpmovzxdq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] ; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0 ; AVX512BWVL-NEXT: vzeroupper @@ -402,41 +401,40 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm1, %xmm7 +; SSE41-NEXT: pxor %xmm6, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 +; SSE41-NEXT: movdqa 32(%rdi), %xmm8 ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: pxor %xmm6, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: movdqa 16(%rdi), %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm6, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; 
SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 ; SSE41-NEXT: pxor %xmm4, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] @@ -444,7 +442,8 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; SSE41-NEXT: pand %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; @@ -466,14 +465,14 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm7, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm3[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -483,12 +482,12 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -796,8 +795,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342] +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -930,8 +929,8 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, 
ptr%p1) { ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342] +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1047,48 +1046,48 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm8 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 ; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pand %xmm9, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm7, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd 
%xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm9, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1115,9 +1114,9 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -1125,17 +1124,17 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; AVX2-LABEL: trunc_usat_v8i64_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm3, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] @@ -1383,7 +1382,7 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; ; SSE41-LABEL: trunc_usat_v8i32_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -1541,16 +1540,16 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535] ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: pminud %xmm0, %xmm2 ; SSE41-NEXT: movdqa 32(%rdi), %xmm1 ; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: pminud %xmm0, %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm3 +; SSE41-NEXT: pminud %xmm0, %xmm3 ; SSE41-NEXT: pminud (%rdi), %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v16i32_v16i16: @@ -1558,10 +1557,10 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = 
[65535,65535,65535,65535] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpminud 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -2121,24 +2120,24 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm7 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 ; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm9 +; SSE41-NEXT: movdqa %xmm2, %xmm8 +; SSE41-NEXT: pxor %xmm6, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm2 ; SSE41-NEXT: pxor %xmm6, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 @@ -2146,17 +2145,17 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm6, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm5, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm8, %xmm2 +; SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pxor %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm7 ; SSE41-NEXT: pxor %xmm4, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] @@ -2164,7 +2163,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; SSE41-NEXT: pand %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -2181,20 +2180,20 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: 
vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm7, %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -2303,42 +2302,42 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 ; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm6 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm9 -; SSE41-NEXT: pxor %xmm5, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packusdw %xmm9, %xmm7 ; SSE41-NEXT: movdqa %xmm6, %xmm8 ; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: movapd %xmm2, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: movdqa 48(%rdi), %xmm9 +; SSE41-NEXT: movdqa %xmm7, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: packusdw %xmm8, %xmm6 +; 
SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm7 ; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] @@ -2346,10 +2345,10 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; SSE41-NEXT: pand %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm7 -; SSE41-NEXT: packuswb %xmm7, %xmm7 -; SSE41-NEXT: movq %xmm7, (%rsi) +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm6 +; SSE41-NEXT: movq %xmm6, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: @@ -2363,20 +2362,20 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = [255,255] +; AVX1-NEXT: # xmm8 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm7, %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm8, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -2542,62 +2541,62 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; SSE41-NEXT: movdqa 112(%rdi), %xmm5 ; SSE41-NEXT: movdqa 64(%rdi), %xmm8 ; SSE41-NEXT: movdqa 80(%rdi), %xmm9 -; SSE41-NEXT: movdqa (%rdi), %xmm12 +; SSE41-NEXT: movdqa (%rdi), %xmm11 ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm13 -; SSE41-NEXT: pxor %xmm7, %xmm13 +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: pxor %xmm7, %xmm10 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm10 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, 
%xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm12 +; SSE41-NEXT: movdqa 32(%rdi), %xmm10 +; SSE41-NEXT: movdqa %xmm11, %xmm2 ; SSE41-NEXT: pxor %xmm7, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 -; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm12 -; SSE41-NEXT: pxor %xmm7, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm6, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm11 +; SSE41-NEXT: movdqa %xmm11, %xmm13 +; SSE41-NEXT: pxor %xmm7, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm13 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm13 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm13 +; SSE41-NEXT: packusdw %xmm12, %xmm2 ; SSE41-NEXT: movdqa %xmm10, %xmm11 ; SSE41-NEXT: pxor %xmm7, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm11 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm11, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packusdw %xmm12, %xmm11 -; SSE41-NEXT: packusdw %xmm11, %xmm2 +; SSE41-NEXT: packusdw %xmm13, %xmm11 ; SSE41-NEXT: movdqa %xmm9, %xmm10 ; SSE41-NEXT: pxor %xmm7, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] ; SSE41-NEXT: pcmpeqd %xmm6, %xmm10 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 ; SSE41-NEXT: pand %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm4, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packusdw %xmm11, %xmm2 ; SSE41-NEXT: movdqa %xmm8, %xmm9 ; SSE41-NEXT: pxor %xmm7, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] @@ -2685,23 +2684,23 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 
-; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6 ; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm5, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vblendvpd %ymm1, %ymm5, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -2925,7 +2924,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; SSE41-LABEL: trunc_usat_v8i32_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -3015,7 +3014,7 @@ define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) { ; ; SSE41-LABEL: trunc_usat_v8i32_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] ; SSE41-NEXT: pminud %xmm2, %xmm1 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -3129,16 +3128,16 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] ; SSE41-NEXT: movdqa 16(%rdi), %xmm2 ; SSE41-NEXT: pminud %xmm1, %xmm2 ; SSE41-NEXT: movdqa (%rdi), %xmm0 ; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 -; SSE41-NEXT: pminud %xmm1, %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pminud %xmm1, %xmm3 ; SSE41-NEXT: pminud 32(%rdi), %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3147,10 +3146,10 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(ptr %p0) { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpminud 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3230,16 +3229,16 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) { ; ; SSE41-LABEL: trunc_usat_v16i32_v16i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255] ; SSE41-NEXT: movdqa 16(%rdi), %xmm1 ; SSE41-NEXT: pminud %xmm0, %xmm1 ; SSE41-NEXT: movdqa (%rdi), %xmm2 ; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: packusdw %xmm1, %xmm2 -; SSE41-NEXT: movdqa 48(%rdi), %xmm1 -; SSE41-NEXT: 
pminud %xmm0, %xmm1 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pminud %xmm0, %xmm3 ; SSE41-NEXT: pminud 32(%rdi), %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm0 ; SSE41-NEXT: packuswb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, (%rsi) ; SSE41-NEXT: retq @@ -3249,10 +3248,10 @@ define void @trunc_usat_v16i32_v16i8_store(ptr %p0, ptr %p1) { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpminud 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rsi) ; AVX1-NEXT: retq @@ -3424,7 +3423,7 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { ; ; SSE41-LABEL: trunc_usat_v16i16_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pminuw %xmm2, %xmm1 ; SSE41-NEXT: pminuw %xmm2, %xmm0 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -3515,16 +3514,16 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v32i16_v32i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: movdqa 48(%rdi), %xmm2 ; SSE41-NEXT: pminuw %xmm0, %xmm2 ; SSE41-NEXT: movdqa 32(%rdi), %xmm1 ; SSE41-NEXT: pminuw %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm2, %xmm1 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: pminuw %xmm0, %xmm2 +; SSE41-NEXT: movdqa 16(%rdi), %xmm3 +; SSE41-NEXT: pminuw %xmm0, %xmm3 ; SSE41-NEXT: pminuw (%rdi), %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v32i16_v32i8: @@ -3532,10 +3531,10 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpminuw 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminuw (%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminuw 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminuw 48(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpminuw 32(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -3559,7 +3558,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512VL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -3668,7 +3667,7 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; ; SSE41-LABEL: trunc_usat_v32i32_v32i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] ; 
SSE41-NEXT: movdqa 80(%rdi), %xmm0 ; SSE41-NEXT: pminud %xmm2, %xmm0 ; SSE41-NEXT: movdqa 64(%rdi), %xmm1 @@ -3697,14 +3696,14 @@ define <32 x i8> @trunc_usat_v32i32_v32i8(ptr %p0) { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255] ; AVX1-NEXT: vpminud 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpminud (%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm3 +; AVX1-NEXT: vpminud 32(%rdi), %xmm0, %xmm4 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminud 48(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpminud 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpminud 80(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminud 80(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpminud 64(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud 64(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpminud 112(%rdi), %xmm0, %xmm3 ; AVX1-NEXT: vpminud 96(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 46f770a349d96..02cbc1d21848e 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -134,7 +134,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -184,7 +184,7 @@ define void @trunc8i64_8i8(<8 x i64> %a, ptr %b) { ; ; SSE41-LABEL: trunc8i64_8i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -198,7 +198,7 @@ define void @trunc8i64_8i8(<8 x i64> %a, ptr %b) { ; ; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -457,7 +457,7 @@ define void @trunc8i32_8i8(<8 x i32> %a, ptr %b) { ; ; SSE41-LABEL: trunc8i32_8i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 @@ -565,7 +565,7 @@ define void @trunc16i32_16i16(<16 x i32> %a, ptr %b) { ; ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -741,7 +741,7 @@ define void @trunc16i32_16i8(<16 x i32> %a, ptr %b) { ; ; SSE41-LABEL: trunc16i32_16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand 
%xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 @@ -754,7 +754,7 @@ define void @trunc16i32_16i8(<16 x i32> %a, ptr %b) { ; ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -922,23 +922,14 @@ entry: ;PR25684 define void @trunc16i16_16i8(<16 x i16> %a, ptr %b) { -; SSE2-SSSE3-LABEL: trunc16i16_16i8: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rdi) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i16_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: trunc16i16_16i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i16_16i8: ; AVX1: # %bb.0: # %entry @@ -1122,35 +1113,22 @@ entry: } define void @trunc32i16_32i8(<32 x i16> %a, ptr %b) { -; SSE2-SSSE3-LABEL: trunc32i16_32i8: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqu %xmm2, 16(%rdi) -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rdi) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc32i16_32i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: movdqu %xmm2, 16(%rdi) -; SSE41-NEXT: movdqu %xmm0, (%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: trunc32i16_32i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm2, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1537,33 +1515,21 @@ entry: } define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x16i16_32i8: -; 
SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x16i16_32i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: packuswb %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x16i16_32i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc2x16i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1621,21 +1587,13 @@ entry: } define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x8i16_16i8: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x8i16_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x8i16_16i8: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc2x8i16_16i8: ; AVX1: # %bb.0: # %entry @@ -1647,7 +1605,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; ; AVX2-LABEL: trunc2x8i16_16i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1655,7 +1613,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512F-LABEL: trunc2x8i16_16i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1663,7 +1621,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: trunc2x8i16_16i8: ; 
AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1871,10 +1829,10 @@ define void @PR34773(ptr %a0, ptr %a1) { ; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1884,10 +1842,10 @@ define void @PR34773(ptr %a0, ptr %a1) { ; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2122,7 +2080,7 @@ define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) a ; ; AVX1-LABEL: store_merge_split: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index ad73bb6886b9f..d3006fd9b0ddb 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -122,7 +122,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -150,7 +150,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -323,7 +323,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -351,7 +351,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -548,7 +548,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -789,7 +789,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1015,7 +1015,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1033,7 +1033,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
+; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1051,7 +1051,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1245,7 +1245,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1263,7 +1263,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1281,7 +1281,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1458,7 +1458,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1473,7 +1473,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1488,7 +1488,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1658,7 +1658,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1673,7 +1673,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1688,7 +1688,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1765,44 +1765,29 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: foldv2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv2i64: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv2i64: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] -; AVX512CD-NEXT: retq +; AVX-LABEL: foldv2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: 
vpmovsxbq {{.*#+}} xmm0 = [8,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: @@ -1819,44 +1804,29 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: foldv2i64u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv2i64u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv2i64u: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv2i64u: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] -; AVX512CD-NEXT: retq +; AVX-LABEL: foldv2i64u: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: @@ -1873,44 +1843,29 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: foldv4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv4i32: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv4i32: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] -; AVX512CD-NEXT: retq +; AVX-LABEL: foldv4i32: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv4i32: @@ -1927,44 +1882,29 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; 
-; AVX1-LABEL: foldv4i32u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i32u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv4i32u: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv4i32u: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] -; AVX512CD-NEXT: retq +; AVX-LABEL: foldv4i32u: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} xmm0 = [8,32] +; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv4i32u: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index 3c35f7b7fb751..6ece6def9dd5a 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -47,8 +47,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -122,8 +121,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -172,8 +170,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -247,8 +244,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -305,8 +301,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -392,8 +387,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -454,8 +448,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -541,8 +534,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand 
%ymm1, %ymm0, %ymm0 @@ -598,8 +590,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -615,10 +606,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -636,8 +626,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -692,8 +681,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -746,8 +734,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -763,10 +750,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -784,8 +770,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -840,8 +825,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -888,8 +872,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -902,10 +885,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -920,8 +902,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -936,8 +917,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -950,10 +930,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -985,8 +964,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1030,8 +1008,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1044,10 +1021,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1062,8 +1038,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1078,8 +1053,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1092,10 +1066,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1127,8 +1100,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; X86-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X86-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X86-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-AVX-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X86-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X86-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X86-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1140,44 +1112,19 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX1-LABEL: foldv4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv4i64: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv4i64: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512CD-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv4i64: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv4i64: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512VPOPCNTDQVL-NEXT: retq +; AVX-LABEL: foldv4i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64: @@ -1189,44 +1136,19 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX1-LABEL: foldv4i64u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv4i64u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv4i64u: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512CDVL-NEXT: retq -; -; 
AVX512CD-LABEL: foldv4i64u: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512CD-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv4i64u: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv4i64u: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] -; AVX512VPOPCNTDQVL-NEXT: retq +; AVX-LABEL: foldv4i64u: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbq {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X86-AVX-LABEL: foldv4i64u: @@ -1238,99 +1160,19 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX1-LABEL: foldv8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv8i32: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv8i32: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512CD-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv8i32: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv8i32: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512VPOPCNTDQVL-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv8i32: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv8i32: -; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG-NEXT: retq -; -; X86-AVX-LABEL: foldv8i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; X86-AVX-NEXT: retl +; ALL-LABEL: foldv8i32: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out } define <8 x i32> @foldv8i32u() nounwind { -; AVX1-LABEL: foldv8i32u: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: foldv8i32u: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: foldv8i32u: -; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: foldv8i32u: -; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512CD-NEXT: retq -; -; AVX512VPOPCNTDQ-LABEL: foldv8i32u: -; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512VPOPCNTDQ-NEXT: retq -; -; AVX512VPOPCNTDQVL-LABEL: foldv8i32u: -; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX512VPOPCNTDQVL-NEXT: retq -; -; 
BITALG_NOVLX-LABEL: foldv8i32u: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv8i32u: -; BITALG: # %bb.0: -; BITALG-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG-NEXT: retq -; -; X86-AVX-LABEL: foldv8i32u: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; X86-AVX-NEXT: retl +; ALL-LABEL: foldv8i32u: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out } @@ -1375,5 +1217,3 @@ declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1) declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1) declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll index 97b988880fac4..873a1529f7d9e 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -31,10 +31,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -91,10 +90,9 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -151,10 +149,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -219,10 +216,9 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -269,8 +265,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -299,10 +294,9 @@ 
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -318,10 +312,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -368,8 +361,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -398,10 +390,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -417,10 +408,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -468,8 +458,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -491,10 +480,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -507,10 +495,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -526,8 +513,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -564,8 +550,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -587,10 
+572,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -603,10 +587,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -622,8 +605,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 97124f0a9d8d9..6c0dade9c1054 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ 
b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -9,39 +9,22 @@ ; we don't need to flip the sign bits in order to map to signed pcmpgt*. define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) { -; SSE2-LABEL: ugt_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: ugt_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ugt_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: ugt_v2i64: ; AVX: # %bb.0: @@ -56,39 +39,22 @@ define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) { } define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) { -; SSE2-LABEL: ult_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: ult_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ult_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pshufd 
{{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: ult_v2i64: ; AVX: # %bb.0: @@ -103,43 +69,24 @@ define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) { } define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) { -; SSE2-LABEL: uge_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uge_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: uge_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: uge_v2i64: ; AVX: # %bb.0: @@ -156,43 +103,24 @@ define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) { } define <2 x i1> @ule_v2i64(<2 x i64> %x, <2 x i64> %y) { -; SSE2-LABEL: ule_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: psrlq $1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: ule_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: psrlq $1, %xmm0 -; SSE41-NEXT: psrlq $1, %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: ule_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: psrlq $1, %xmm0 +; SSE-NEXT: psrlq $1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: ule_v2i64: ; AVX: # %bb.0: @@ -438,7 +366,7 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX2-LABEL: ugt_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -475,7 +403,7 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX2-LABEL: ult_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -513,7 +441,7 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX2-LABEL: uge_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -552,7 +480,7 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX2-LABEL: ule_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -584,7 +512,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) { ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; SSE41-NEXT: pmaxuw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index bd1a48ba5d6ec..68b63af8e4564 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2493,7 +2493,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) 
nounwind uwtable readnone ssp { ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [63,63] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 54dc107fd0c10..440eaecb3b653 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -460,7 +460,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl @@ -487,7 +487,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq @@ -518,7 +518,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl @@ -545,7 +545,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq @@ -576,7 +576,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl @@ -603,7 +603,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq @@ -634,7 +634,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl @@ -661,7 +661,7 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 17315c436188a..db45cb94697da 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -47,7 +47,7 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq (%rdi,%rsi,8), %rax -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] ; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX1-NEXT: vmovupd %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -57,9 +57,9 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX2: ## %bb.0: ## %bb ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: movq (%rdi,%rsi,8), %rax ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX2-NEXT: movq (%rdi,%rsi,8), %rax ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -69,8 +69,8 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX512: ## %bb.0: ## %bb ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512-NEXT: movq (%rdi,%rsi,8), %rax ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX512-NEXT: movq (%rdi,%rsi,8), %rax ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 {%k1} = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX512-NEXT: vmovupd %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -110,9 +110,9 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %t ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] ; AVX2-NEXT: vpmulld %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655764,1431655764,1431655764,1431655764] ; AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpminud %xmm4, %xmm0, %xmm3 ; AVX2-NEXT: 
vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -176,7 +176,7 @@ define <32 x i8> @PR22706(<32 x i1> %x) { ; AVX512-LABEL: PR22706: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512-NEXT: retq %tmp = select <32 x i1> %x, <32 x i8> , <32 x i8> @@ -241,11 +241,11 @@ define void @blendv_split(ptr %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, < ; AVX512-NEXT: vpsrld $31, %ymm0, %ymm0 ; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero -; AVX512-NEXT: vpslld %xmm2, %ymm1, %ymm2 -; AVX512-NEXT: vpslld %xmm0, %ymm1, %ymm2 {%k1} -; AVX512-NEXT: vmovdqu %ymm2, (%rdi) +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; AVX512-NEXT: vpslld %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX512-NEXT: vpslld %xmm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %signbits = ashr <8 x i32> %cond, @@ -373,7 +373,7 @@ define void @vselect_concat_splat() { ; AVX512: ## %bb.0: ## %entry ; AVX512-NEXT: vmovups (%rax), %ymm0 ; AVX512-NEXT: vmovups (%rax), %xmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10] +; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10] ; AVX512-NEXT: vmovaps %ymm2, %ymm3 ; AVX512-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 ; AVX512-NEXT: vmovups 32, %xmm4 @@ -382,7 +382,7 @@ define void @vselect_concat_splat() { ; AVX512-NEXT: kshiftlw $4, %k0, %k1 ; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,11,14,1,9,12,15,2] +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [8,11,14,1,9,12,15,2] ; AVX512-NEXT: vpermi2ps 0, %ymm4, %ymm1 ; AVX512-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; AVX512-NEXT: vmovups %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll index 34bda718db8f6..2fbe2fba22c4d 100644 --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -83,7 +83,7 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) { ; AVX-LABEL: cmp_sel_Cplus1_or_C_vec: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [42,18446744073709551614] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295] ; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %cond = icmp eq <4 x i32> %x, %y @@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,0] +; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll index cb0542ca7cea8..3bd161dc9a4da 100644 --- a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -9537,7 +9537,7 @@ 
define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test181: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9606,7 +9606,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test182: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9675,7 +9675,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test183: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9744,7 +9744,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test184: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10047,7 +10047,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test189: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10116,7 +10116,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test190: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10185,7 +10185,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test191: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10254,7 +10254,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test192: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll index 5b14e2782ee1c..13a7a81cef3cb 100644 --- a/llvm/test/CodeGen/X86/vselect-packss.ll +++ b/llvm/test/CodeGen/X86/vselect-packss.ll @@ -200,8 +200,8 @@ define 
<16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8 ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2] -; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2] ; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: packssdw %xmm8, %xmm7 @@ -219,11 +219,11 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8 ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packssdw %xmm4, %xmm3 ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: packssdw %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: packssdw %xmm2, %xmm0 @@ -240,12 +240,12 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8 ; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm7 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm6 -; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm5 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm4 +; SSE42-NEXT: packssdw %xmm7, %xmm6 ; SSE42-NEXT: packssdw %xmm5, %xmm4 -; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm3 +; SSE42-NEXT: packssdw %xmm6, %xmm4 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm2 ; SSE42-NEXT: packssdw %xmm3, %xmm2 ; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm1 diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll index ab487ed888981..725b387db8c45 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -514,9 +514,9 @@ define <4 x i64> @blend_splat1_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x ; AVX512F-LABEL: blend_splat1_mask_cond_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -531,7 +531,8 @@ define <4 x i64> @blend_splat1_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpsllq $63, %xmm3, %xmm3 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm4 = mem[0,0] ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpsllq $63, %xmm0, %xmm0 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0 @@ -554,9 +555,9 @@ define <4 x i32> @blend_splat1_mask_cond_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x ; AVX512F-LABEL: blend_splat1_mask_cond_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpblendmd 
%zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -643,7 +644,7 @@ define <16 x i8> @blend_splat1_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512F-LABEL: blend_splat1_mask_cond_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 @@ -651,7 +652,7 @@ define <16 x i8> @blend_splat1_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512VL-LABEL: blend_splat1_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & (xmm2 ^ xmm1)) @@ -681,7 +682,7 @@ define <2 x i64> @blend_splatmax_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -716,9 +717,9 @@ define <8 x i32> @blend_splatmax_mask_cond_v8i32(<8 x i32> %x, <8 x i32> %y, <8 ; AVX512F-LABEL: blend_splatmax_mask_cond_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpblendmd %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -803,7 +804,7 @@ define <32 x i8> @blend_splatmax_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 ; ; AVX512VL-LABEL: blend_splatmax_mask_cond_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm2 ^ ymm1)) @@ -843,9 +844,9 @@ define <4 x i64> @blend_splat_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i ; AVX512F-LABEL: blend_splat_mask_cond_v4i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; 
AVX512F-NEXT: retq @@ -860,7 +861,8 @@ define <4 x i64> @blend_splat_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i ; XOP: # %bb.0: ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpsllq $62, %xmm3, %xmm3 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551553,18446744073709551553] +; XOP-NEXT: # xmm4 = mem[0,0] ; XOP-NEXT: vpshaq %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpsllq $62, %xmm0, %xmm0 ; XOP-NEXT: vpshaq %xmm4, %xmm0, %xmm0 @@ -883,9 +885,9 @@ define <4 x i32> @blend_splat_mask_cond_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i ; AVX512F-LABEL: blend_splat_mask_cond_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpblendmd %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -972,7 +974,7 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512F-LABEL: blend_splat_mask_cond_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 @@ -980,7 +982,7 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x ; ; AVX512VL-LABEL: blend_splat_mask_cond_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & (xmm2 ^ xmm1)) @@ -1002,7 +1004,7 @@ define <16 x i8> @blend_splat_mask_cond_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { ; AVX1-LABEL: blend_mask_cond_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4] ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 @@ -1019,7 +1021,7 @@ define <2 x i64> @blend_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,4] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -1182,7 +1184,7 @@ define <4 x i64> @blend_mask_cond_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm3 = [2,4,32768,1] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [2,4,32768,1] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 
{%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1229,7 +1231,7 @@ define <8 x i32> @blend_mask_cond_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm3 = [1,2,8,4,8,1024,2,4096] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,8,4,8,1024,2,4096] ; AVX512F-NEXT: vptestnmd %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmd %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1386,12 +1388,12 @@ define void @store_blend_load_v4i64(ptr %a0, ptr %a1, ptr %a2) { ; AVX2-LABEL: store_blend_load_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovapd (%rsi), %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovapd (%rsi), %ymm2 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775815,9223372036854775815,9223372036854775815,9223372036854775815] -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1418,12 +1420,13 @@ define void @store_blend_load_v4i64(ptr %a0, ptr %a1, ptr %a2) { ; ; XOP-LABEL: store_blend_load_v4i64: ; XOP: # %bb.0: -; XOP-NEXT: vmovapd (%rsi), %ymm0 -; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7] -; XOP-NEXT: vpcomltuq 16(%rdi), %xmm1, %xmm2 -; XOP-NEXT: vpcomltuq (%rdi), %xmm1, %xmm1 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOP-NEXT: vblendvpd %ymm1, (%rdi), %ymm0, %ymm0 +; XOP-NEXT: vmovddup {{.*#+}} xmm0 = [7,7] +; XOP-NEXT: # xmm0 = mem[0,0] +; XOP-NEXT: vpcomltuq 16(%rdi), %xmm0, %xmm1 +; XOP-NEXT: vmovapd (%rsi), %ymm2 +; XOP-NEXT: vpcomltuq (%rdi), %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vblendvpd %ymm0, (%rdi), %ymm2, %ymm0 ; XOP-NEXT: vmovapd %ymm0, (%rdx) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq @@ -1486,12 +1489,12 @@ define void @store_blend_load_v8i32(ptr %a0, ptr %a1, ptr %a2) { ; ; XOP-LABEL: store_blend_load_v8i32: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps (%rsi), %ymm0 -; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] -; XOP-NEXT: vpcomltud 16(%rdi), %xmm1, %xmm2 -; XOP-NEXT: vpcomltud (%rdi), %xmm1, %xmm1 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOP-NEXT: vblendvps %ymm1, (%rdi), %ymm0, %ymm0 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,7,7,7] +; XOP-NEXT: vpcomltud 16(%rdi), %xmm0, %xmm1 +; XOP-NEXT: vmovaps (%rsi), %ymm2 +; XOP-NEXT: vpcomltud (%rdi), %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOP-NEXT: vblendvps %ymm0, (%rdi), %ymm2, %ymm0 ; XOP-NEXT: vmovaps %ymm0, (%rdx) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq @@ -1657,8 +1660,8 @@ define void @PR46531(ptr %x, ptr %y, ptr %z) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqu (%rsi), %xmm0 ; AVX512F-NEXT: vmovdqu (%rdx), %xmm1 -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm2 ; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm2 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vmovdqa32 
%zmm2, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu %xmm0, (%rdi) @@ -1667,11 +1670,11 @@ define void @PR46531(ptr %x, ptr %y, ptr %z) { ; ; AVX512VL-LABEL: PR46531: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rsi), %xmm0 -; AVX512VL-NEXT: vmovdqu (%rdx), %xmm1 -; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1 -; AVX512VL-NEXT: vpxor %xmm0, %xmm1, %xmm2 -; AVX512VL-NEXT: vpord %xmm0, %xmm1, %xmm2 {%k1} +; AVX512VL-NEXT: vmovdqu (%rdx), %xmm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512VL-NEXT: vmovdqu (%rsi), %xmm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ; AVX512VL-NEXT: vmovdqu %xmm2, (%rdi) ; AVX512VL-NEXT: retq ; @@ -1723,7 +1726,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] ; AVX1-NEXT: vandnps %ymm4, %ymm2, %ymm5 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm5, %ymm0, %ymm0 @@ -1751,12 +1754,12 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { ; AVX512F-LABEL: PR110875: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-NEXT: vmovq %rdi, %xmm0 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1769,11 +1772,11 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { ; AVX512VL-LABEL: PR110875: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] -; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; AVX512VL-NEXT: vpandq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1807,7 +1810,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) { ; XOP-NEXT: vpcomeqb %xmm5, %xmm4, %xmm4 ; XOP-NEXT: vpcomeqb %xmm5, %xmm2, %xmm2 ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; XOP-NEXT: vbroadcastss {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] +; XOP-NEXT: vmovdqa {{.*#+}} ymm4 = [20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20] ; XOP-NEXT: vpcmov %ymm2, %ymm4, %ymm0, %ymm0 ; XOP-NEXT: vpcmov %ymm3, %ymm4, %ymm1, %ymm1 ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll index ba51e1fc90c14..478a1b8c7e593 100644 --- a/llvm/test/CodeGen/X86/vselect-post-combine.ll +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -4,8 +4,8 @@ define ptr @test_mul(ptr %addr) { ; AVX2-LABEL: test_mul: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = [255,0] -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vmovdqu %ymm0, 0 diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index be6ee8f689958..f4663b310c542 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -480,7 +480,7 @@ define <16 x i8> @test26(<16 x i8> %a, <16 x i8> %b) { ; ; AVX2-LABEL: test26: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = select <16 x i1> , <16 x i8> %a, <16 x i8> %b @@ -694,12 +694,12 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) { ; AVX-LABEL: simplify_select: ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX-NEXT: vmovd %edi, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 @@ -715,33 +715,21 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) { ; Test to make sure we don't try to insert a new setcc to swap the operands ; of select with all zeros LHS if the setcc has additional users. 
define void @vselect_allzeros_LHS_multiple_use_setcc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, ptr %p1, ptr %p2) { -; SSE2-LABEL: vselect_allzeros_LHS_multiple_use_setcc: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, (%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE41-LABEL: vselect_allzeros_LHS_multiple_use_setcc: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pandn %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, (%rdi) -; SSE41-NEXT: movdqa %xmm0, (%rsi) -; SSE41-NEXT: retq +; SSE-LABEL: vselect_allzeros_LHS_multiple_use_setcc: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, (%rdi) +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: retq ; ; AVX-LABEL: vselect_allzeros_LHS_multiple_use_setcc: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,2,4,8] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8] ; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll index 912ff750d9e91..c64f7e8279df9 100644 --- a/llvm/test/CodeGen/X86/vshift-6.ll +++ b/llvm/test/CodeGen/X86/vshift-6.ll @@ -35,11 +35,11 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) { ; X86-NEXT: pcmpeqd %xmm3, %xmm3 ; X86-NEXT: psllw $5, %xmm1 ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 ; X86-NEXT: pxor %xmm0, %xmm0 ; X86-NEXT: pcmpgtb %xmm1, %xmm0 ; X86-NEXT: pxor %xmm0, %xmm3 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pxor %xmm2, %xmm2 ; X86-NEXT: por %xmm3, %xmm0 ; X86-NEXT: paddb %xmm1, %xmm1 ; X86-NEXT: pxor %xmm3, %xmm3 @@ -67,11 +67,11 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) { ; X64-NEXT: pcmpeqd %xmm2, %xmm2 ; X64-NEXT: psllw $5, %xmm1 ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtb %xmm1, %xmm0 ; X64-NEXT: pxor %xmm0, %xmm2 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: paddb %xmm1, %xmm1 ; X64-NEXT: pxor %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll index a15d633d85381..8e92bf763ce89 100644 --- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll +++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll @@ -5,10 +5,10 @@ define i32 @branch_eq(i64 %a, i64 %b) { ; CHECK-LABEL: branch_eq: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax @@ -29,9 +29,9 @@ define i32 @branch_slt(i64 %a, i64 %b) { ; CHECK-LABEL: branch_slt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jge .LBB1_2 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax @@ -52,9 +52,9 @@ define i32 @branch_ule(i64 %a, i64 %b) { ; CHECK-LABEL: branch_ule: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jb .LBB2_2 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax @@ -75,9 +75,9 @@ define i32 @set_gt(i64 %a, i64 %b) { ; CHECK-LABEL: set_gt: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: setl %al ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: retl @@ -90,28 +90,20 @@ entry: define i32 @test_wide(i128 %a, i128 %b) { ; CHECK-LABEL: test_wide: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: .cfi_offset %esi, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jge .LBB4_2 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: popl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl ; CHECK-NEXT: .LBB4_2: # %bb2 -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: movl $2, %eax -; CHECK-NEXT: popl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: %cmp = icmp slt i128 %a, %b diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba69ae5b..cc3a8ef42d72f 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -280,21 +280,21 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; 
X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%eax) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%esi) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, (%esi) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl @@ -412,21 +412,21 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%eax), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shldl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%eax) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 4(%esi) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl @@ -460,28 +460,28 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi -; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx -; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl (%edx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx -; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%edx), %esi +; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edx ; 
X86-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al -; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %ebx, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %ebx, %esi -; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 4(%edx) -; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx +; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 4(%ebx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, (%ebx) ; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx @@ -544,21 +544,21 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %esi, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, (%esi) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl @@ -655,54 +655,50 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: subl $60, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %edx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %al +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movzbl (%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, %eax ; FALLBACK16-NEXT: shlb $3, %al -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: xorps %xmm1, %xmm1 +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: 
movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: movzbl %ah, %ebp -; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %ebx +; FALLBACK16-NEXT: andb $12, %cl +; FALLBACK16-NEXT: movzbl %cl, %edi +; FALLBACK16-NEXT: movl 16(%esp,%edi), %edx ; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 20(%esp,%edi), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %eax, %edx ; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK16-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl 24(%esp,%edi), %ebp +; FALLBACK16-NEXT: movl %ebp, %ebx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 28(%esp,%edi), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi +; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl %ebx, 12(%edx) -; FALLBACK16-NEXT: movl %ebp, 8(%edx) +; FALLBACK16-NEXT: movl %ebp, 4(%edx) +; FALLBACK16-NEXT: movl %edi, 8(%edx) ; FALLBACK16-NEXT: movl %esi, (%edx) -; FALLBACK16-NEXT: movl %edi, 4(%edx) ; FALLBACK16-NEXT: addl $60, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi @@ -719,35 +715,30 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: subl $44, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: 
movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, (%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebx -; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi -; FALLBACK17-NEXT: movl (%esp,%ebx), %edx -; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp -; FALLBACK17-NEXT: movl %ebp, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx +; FALLBACK17-NEXT: xorps %xmm1, %xmm1 +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, (%esp) +; FALLBACK17-NEXT: andb $12, %al +; FALLBACK17-NEXT: movzbl %al, %esi +; FALLBACK17-NEXT: movl 12(%esp,%esi), %eax +; FALLBACK17-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK17-NEXT: movl %edi, %edx +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK17-NEXT: movl %ebx, %ebp +; FALLBACK17-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK17-NEXT: movl (%esp,%esi), %esi ; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx -; FALLBACK17-NEXT: shrl %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %ebx, 12(%eax) -; FALLBACK17-NEXT: movl %edx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) +; FALLBACK17-NEXT: shrl %cl, %eax +; FALLBACK17-NEXT: movl %ebp, 4(%edi) +; FALLBACK17-NEXT: movl %edx, 8(%edi) +; FALLBACK17-NEXT: movl %eax, 12(%edi) +; FALLBACK17-NEXT: movl %esi, (%edi) ; FALLBACK17-NEXT: addl $44, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -764,44 +755,38 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: subl $44, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %ebx -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movzbl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, (%esp) -; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx -; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp +; FALLBACK18-NEXT: xorps %xmm1, %xmm1 +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, (%esp) +; FALLBACK18-NEXT: andb $12, %cl +; FALLBACK18-NEXT: movzbl %cl, %edi +; FALLBACK18-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebp +; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK18-NEXT: leal (,%ebp,2), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK18-NEXT: shrxl %eax, %edi, %eax ; 
FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ebp, %edi -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %eax +; FALLBACK18-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK18-NEXT: orl %ebx, %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %edx -; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl %eax, 12(%esi) -; FALLBACK18-NEXT: movl %edx, 8(%esi) -; FALLBACK18-NEXT: movl %edi, (%esi) ; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %edi, 8(%esi) +; FALLBACK18-NEXT: movl %edx, (%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -816,38 +801,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $44, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK19-NEXT: movl (%edx), %esi -; FALLBACK19-NEXT: movl 4(%edx), %edi -; FALLBACK19-NEXT: movl 8(%edx), %ebx -; FALLBACK19-NEXT: movl 12(%edx), %edx -; FALLBACK19-NEXT: movzbl (%ecx), %eax +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movzbl (%eax), %eax ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, (%esp) +; FALLBACK19-NEXT: xorps %xmm1, %xmm1 +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, (%esp) ; FALLBACK19-NEXT: andb $12, %al -; FALLBACK19-NEXT: movzbl %al, %eax -; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx -; FALLBACK19-NEXT: movl (%esp,%eax), %edx -; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi -; FALLBACK19-NEXT: movl %esi, %edi -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi -; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax -; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK19-NEXT: movl %ebx, 8(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: movzbl %al, %esi +; FALLBACK19-NEXT: movl 12(%esp,%esi), %edx +; FALLBACK19-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK19-NEXT: movl %edi, %eax +; FALLBACK19-NEXT: shrdl %cl, %edx, %eax +; FALLBACK19-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK19-NEXT: movl %ebx, %ebp +; FALLBACK19-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK19-NEXT: movl (%esp,%esi), %esi +; FALLBACK19-NEXT: shrxl %ecx, %edx, %edx ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl %edi, 4(%ebp) +; FALLBACK19-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK19-NEXT: movl %ebp, 4(%edi) +; FALLBACK19-NEXT: movl %eax, 8(%edi) +; FALLBACK19-NEXT: movl %edx, 12(%edi) +; FALLBACK19-NEXT: movl %esi, (%edi) ; FALLBACK19-NEXT: addl $44, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi @@ -873,42 +853,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: 
movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $12, %cl ; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 16(%esp,%edi), %edx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 20(%esp,%edi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %esi +; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %ebx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebp +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 28(%esp,%edi), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp +; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl %edi, 12(%edx) -; FALLBACK20-NEXT: movl %ebx, 4(%edx) -; FALLBACK20-NEXT: movl %ebp, 8(%edx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, (%edx) +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl %ebx, 12(%edx) +; FALLBACK20-NEXT: movl %ebp, 4(%edx) +; FALLBACK20-NEXT: movl %edi, 8(%edx) +; FALLBACK20-NEXT: movl %esi, (%edx) ; FALLBACK20-NEXT: addl $60, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi @@ -923,32 +902,32 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $44, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movups (%edx), %xmm0 -; FALLBACK21-NEXT: movzbl (%ecx), %edx -; FALLBACK21-NEXT: movl %edx, %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movzbl (%eax), %eax +; FALLBACK21-NEXT: movl %eax, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm1, %xmm1 ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) 
; FALLBACK21-NEXT: movaps %xmm0, (%esp) -; FALLBACK21-NEXT: andb $12, %dl -; FALLBACK21-NEXT: movzbl %dl, %ebx -; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK21-NEXT: movl %ebp, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%esp,%ebx), %esi -; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK21-NEXT: movl %eax, %ebx -; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %ebx, 4(%ebp) -; FALLBACK21-NEXT: movl %edi, 8(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: shrl %cl, %edx -; FALLBACK21-NEXT: movl %edx, 12(%ebp) -; FALLBACK21-NEXT: movl %esi, (%ebp) +; FALLBACK21-NEXT: andb $12, %al +; FALLBACK21-NEXT: movzbl %al, %esi +; FALLBACK21-NEXT: movl 12(%esp,%esi), %eax +; FALLBACK21-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK21-NEXT: movl %edi, %edx +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK21-NEXT: movl %ebx, %ebp +; FALLBACK21-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK21-NEXT: movl (%esp,%esi), %esi +; FALLBACK21-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK21-NEXT: shrl %cl, %eax +; FALLBACK21-NEXT: movl %ebp, 4(%edi) +; FALLBACK21-NEXT: movl %edx, 8(%edi) +; FALLBACK21-NEXT: movl %eax, 12(%edi) +; FALLBACK21-NEXT: movl %esi, (%edi) ; FALLBACK21-NEXT: addl $44, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -979,7 +958,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: notb %cl ; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK22-NEXT: leal (,%ebp,2), %edx ; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK22-NEXT: orl %ebx, %edx ; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx @@ -1011,33 +990,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $44, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movups (%edx), %xmm0 -; FALLBACK23-NEXT: movzbl (%ecx), %edx -; FALLBACK23-NEXT: movl %edx, %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movzbl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm1, %xmm1 ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, (%esp) -; FALLBACK23-NEXT: andb $12, %dl -; FALLBACK23-NEXT: movzbl %dl, %ebx -; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl (%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %ebx -; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %ebx, 4(%ebp) -; FALLBACK23-NEXT: movl %edi, 8(%ebp) +; FALLBACK23-NEXT: andb $12, %al +; FALLBACK23-NEXT: movzbl %al, %esi +; FALLBACK23-NEXT: movl 12(%esp,%esi), %edx +; FALLBACK23-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK23-NEXT: movl %edi, %eax +; FALLBACK23-NEXT: shrdl %cl, %edx, %eax +; FALLBACK23-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK23-NEXT: movl %ebx, %ebp +; FALLBACK23-NEXT: shrdl %cl, %edi, %ebp +; 
FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK23-NEXT: movl (%esp,%esi), %esi ; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK23-NEXT: movl %edx, 12(%ebp) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, (%ebp) +; FALLBACK23-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK23-NEXT: movl %ebp, 4(%edi) +; FALLBACK23-NEXT: movl %eax, 8(%edi) +; FALLBACK23-NEXT: movl %edx, 12(%edi) +; FALLBACK23-NEXT: movl %esi, (%edi) ; FALLBACK23-NEXT: addl $44, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi @@ -1063,42 +1042,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $12, %cl ; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 16(%esp,%edi), %edx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 20(%esp,%edi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %esi +; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %ebx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebp +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 28(%esp,%edi), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp +; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl %edi, 12(%edx) -; FALLBACK24-NEXT: movl %ebx, 4(%edx) -; FALLBACK24-NEXT: movl %ebp, 8(%edx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, (%edx) +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl %ebx, 12(%edx) +; FALLBACK24-NEXT: movl %ebp, 4(%edx) +; FALLBACK24-NEXT: movl %edi, 8(%edx) +; FALLBACK24-NEXT: movl %esi, (%edx) ; FALLBACK24-NEXT: addl $60, %esp ; FALLBACK24-NEXT: popl %esi 
; FALLBACK24-NEXT: popl %edi @@ -1113,32 +1091,32 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $44, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: vmovups (%edx), %xmm0 -; FALLBACK25-NEXT: movzbl (%ecx), %edx -; FALLBACK25-NEXT: movl %edx, %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK25-NEXT: movzbl (%eax), %eax +; FALLBACK25-NEXT: movl %eax, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK25-NEXT: andb $12, %dl -; FALLBACK25-NEXT: movzbl %dl, %ebx -; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK25-NEXT: movl %ebp, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%esp,%ebx), %esi -; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK25-NEXT: movl %eax, %ebx -; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %ebx, 4(%ebp) -; FALLBACK25-NEXT: movl %edi, 8(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: shrl %cl, %edx -; FALLBACK25-NEXT: movl %edx, 12(%ebp) -; FALLBACK25-NEXT: movl %esi, (%ebp) +; FALLBACK25-NEXT: andb $12, %al +; FALLBACK25-NEXT: movzbl %al, %esi +; FALLBACK25-NEXT: movl 12(%esp,%esi), %eax +; FALLBACK25-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK25-NEXT: movl %edi, %edx +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK25-NEXT: movl %ebx, %ebp +; FALLBACK25-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK25-NEXT: movl (%esp,%esi), %esi +; FALLBACK25-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK25-NEXT: shrl %cl, %eax +; FALLBACK25-NEXT: movl %ebp, 4(%edi) +; FALLBACK25-NEXT: movl %edx, 8(%edi) +; FALLBACK25-NEXT: movl %eax, 12(%edi) +; FALLBACK25-NEXT: movl %esi, (%edi) ; FALLBACK25-NEXT: addl $44, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -1169,7 +1147,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: notb %cl ; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK26-NEXT: leal (,%ebp,2), %edx ; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK26-NEXT: orl %ebx, %edx ; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx @@ -1201,33 +1179,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $44, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: vmovups (%edx), %xmm0 -; FALLBACK27-NEXT: movzbl (%ecx), %edx -; FALLBACK27-NEXT: movl %edx, %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK27-NEXT: movzbl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK27-NEXT: andb $12, %dl -; FALLBACK27-NEXT: movzbl %dl, %ebx -; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 8(%esp,%ebx), 
%ebp -; FALLBACK27-NEXT: movl %ebp, %edi -; FALLBACK27-NEXT: shrdl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %ebx -; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK27-NEXT: movl %ebx, 4(%ebp) -; FALLBACK27-NEXT: movl %edi, 8(%ebp) +; FALLBACK27-NEXT: andb $12, %al +; FALLBACK27-NEXT: movzbl %al, %esi +; FALLBACK27-NEXT: movl 12(%esp,%esi), %edx +; FALLBACK27-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK27-NEXT: movl %edi, %eax +; FALLBACK27-NEXT: shrdl %cl, %edx, %eax +; FALLBACK27-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK27-NEXT: movl %ebx, %ebp +; FALLBACK27-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK27-NEXT: movl (%esp,%esi), %esi ; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK27-NEXT: movl %edx, 12(%ebp) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl %esi, (%ebp) +; FALLBACK27-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK27-NEXT: movl %ebp, 4(%edi) +; FALLBACK27-NEXT: movl %eax, 8(%edi) +; FALLBACK27-NEXT: movl %edx, 12(%edi) +; FALLBACK27-NEXT: movl %esi, (%edi) ; FALLBACK27-NEXT: addl $44, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi @@ -1253,42 +1231,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $12, %cl ; FALLBACK28-NEXT: movzbl %cl, %edi -; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 16(%esp,%edi), %edx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 20(%esp,%edi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %esi +; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %ebx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebp +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 28(%esp,%edi), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp +; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; 
FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl %edi, 12(%edx) -; FALLBACK28-NEXT: movl %ebx, 4(%edx) -; FALLBACK28-NEXT: movl %ebp, 8(%edx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%edx) +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl %ebx, 12(%edx) +; FALLBACK28-NEXT: movl %ebp, 4(%edx) +; FALLBACK28-NEXT: movl %edi, 8(%edx) +; FALLBACK28-NEXT: movl %esi, (%edx) ; FALLBACK28-NEXT: addl $60, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi @@ -1303,32 +1280,32 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $44, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: vmovups (%edx), %xmm0 -; FALLBACK29-NEXT: movzbl (%ecx), %edx -; FALLBACK29-NEXT: movl %edx, %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK29-NEXT: movzbl (%eax), %eax +; FALLBACK29-NEXT: movl %eax, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK29-NEXT: andb $12, %dl -; FALLBACK29-NEXT: movzbl %dl, %ebx -; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK29-NEXT: movl %ebp, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%esp,%ebx), %esi -; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK29-NEXT: movl %eax, %ebx -; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %ebx, 4(%ebp) -; FALLBACK29-NEXT: movl %edi, 8(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: shrl %cl, %edx -; FALLBACK29-NEXT: movl %edx, 12(%ebp) -; FALLBACK29-NEXT: movl %esi, (%ebp) +; FALLBACK29-NEXT: andb $12, %al +; FALLBACK29-NEXT: movzbl %al, %esi +; FALLBACK29-NEXT: movl 12(%esp,%esi), %eax +; FALLBACK29-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK29-NEXT: movl %edi, %edx +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK29-NEXT: movl %ebx, %ebp +; FALLBACK29-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK29-NEXT: movl (%esp,%esi), %esi +; FALLBACK29-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK29-NEXT: shrl %cl, %eax +; FALLBACK29-NEXT: movl %ebp, 4(%edi) +; FALLBACK29-NEXT: movl %edx, 8(%edi) +; FALLBACK29-NEXT: movl %eax, 12(%edi) +; FALLBACK29-NEXT: movl %esi, (%edi) ; FALLBACK29-NEXT: addl $44, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -1359,7 +1336,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: notb %cl ; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK30-NEXT: leal (,%ebp,2), %edx ; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK30-NEXT: orl %ebx, %edx ; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx @@ -1391,33 +1368,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: pushl %edi ; 
FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $44, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: vmovups (%edx), %xmm0 -; FALLBACK31-NEXT: movzbl (%ecx), %edx -; FALLBACK31-NEXT: movl %edx, %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK31-NEXT: movzbl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK31-NEXT: andb $12, %dl -; FALLBACK31-NEXT: movzbl %dl, %ebx -; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %ebx -; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %ebx, 4(%ebp) -; FALLBACK31-NEXT: movl %edi, 8(%ebp) +; FALLBACK31-NEXT: andb $12, %al +; FALLBACK31-NEXT: movzbl %al, %esi +; FALLBACK31-NEXT: movl 12(%esp,%esi), %edx +; FALLBACK31-NEXT: movl 8(%esp,%esi), %edi +; FALLBACK31-NEXT: movl %edi, %eax +; FALLBACK31-NEXT: shrdl %cl, %edx, %eax +; FALLBACK31-NEXT: movl 4(%esp,%esi), %ebx +; FALLBACK31-NEXT: movl %ebx, %ebp +; FALLBACK31-NEXT: shrdl %cl, %edi, %ebp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK31-NEXT: movl (%esp,%esi), %esi ; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK31-NEXT: movl %edx, 12(%ebp) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, (%ebp) +; FALLBACK31-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK31-NEXT: movl %ebp, 4(%edi) +; FALLBACK31-NEXT: movl %eax, 8(%edi) +; FALLBACK31-NEXT: movl %edx, 12(%edi) +; FALLBACK31-NEXT: movl %esi, (%edi) ; FALLBACK31-NEXT: addl $44, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi @@ -1510,37 +1487,19 @@ define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; ; X86-SSE2-LABEL: lshr_16bytes_dwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $32, %esp +; X86-SSE2-NEXT: subl $44, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %esi -; X86-SSE2-NEXT: movl 4(%edx), %edi -; X86-SSE2-NEXT: movl 8(%edx), %ebx -; X86-SSE2-NEXT: movl 12(%edx), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 ; X86-SSE2-NEXT: movzbl (%ecx), %ecx -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, (%esp) +; X86-SSE2-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) ; X86-SSE2-NEXT: andl $3, %ecx -; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx -; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) -; X86-SSE2-NEXT: movl %esi, 4(%eax) -; X86-SSE2-NEXT: addl $32, %esp 
-; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $44, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_16bytes_dwordOff: @@ -1670,57 +1629,52 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: subl $60, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %ebx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movzbl (%eax), %ecx +; FALLBACK16-NEXT: movb %cl, %ch +; FALLBACK16-NEXT: shlb $3, %ch +; FALLBACK16-NEXT: xorps %xmm1, %xmm1 +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: negb %ah -; FALLBACK16-NEXT: movsbl %ah, %ebp -; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: shrl %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: andb $12, %cl +; FALLBACK16-NEXT: negb %cl +; FALLBACK16-NEXT: movsbl %cl, %ebp ; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %ch, %dl +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK16-NEXT: movl %eax, %esi ; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %edi, %esi -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 8(%eax) -; FALLBACK16-NEXT: movl %ebp, 12(%eax) -; FALLBACK16-NEXT: movl %ebx, 4(%eax) +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%esp,%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl 32(%esp,%ebp), %eax +; 
FALLBACK16-NEXT: movl %eax, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl %eax, (%edx) +; FALLBACK16-NEXT: movl %ebp, 4(%edx) +; FALLBACK16-NEXT: movl %edi, 8(%edx) +; FALLBACK16-NEXT: movl %esi, 12(%edx) ; FALLBACK16-NEXT: addl $60, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi @@ -1730,45 +1684,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK17-LABEL: shl_16bytes: ; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $32, %esp +; FALLBACK17-NEXT: subl $44, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, (%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: negb %ch -; FALLBACK17-NEXT: movsbl %ch, %edi -; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK17-NEXT: shldl %cl, %edi, %esi -; FALLBACK17-NEXT: shldl %cl, %ebx, %edi -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %edx, 12(%eax) -; FALLBACK17-NEXT: movl %ebx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) -; FALLBACK17-NEXT: addl $32, %esp +; FALLBACK17-NEXT: xorps %xmm1, %xmm1 +; FALLBACK17-NEXT: movaps %xmm1, (%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: andb $12, %al +; FALLBACK17-NEXT: negb %al +; FALLBACK17-NEXT: movsbl %al, %esi +; FALLBACK17-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK17-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK17-NEXT: shldl %cl, %edx, %eax +; FALLBACK17-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK17-NEXT: shldl %cl, %edi, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK17-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK17-NEXT: movl %esi, %ebp +; FALLBACK17-NEXT: shll %cl, %ebp +; FALLBACK17-NEXT: shldl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, 4(%ebx) +; FALLBACK17-NEXT: movl %edx, 8(%ebx) +; FALLBACK17-NEXT: movl %eax, 12(%ebx) +; FALLBACK17-NEXT: movl %ebp, (%ebx) +; FALLBACK17-NEXT: addl $44, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: shl_16bytes: @@ -1780,45 +1731,40 @@ define void @shl_16bytes(ptr %src.ptr, ptr 
%byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: subl $44, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movzbl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: xorps %xmm1, %xmm1 +; FALLBACK18-NEXT: movaps %xmm1, (%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: andb $12, %cl +; FALLBACK18-NEXT: negb %cl +; FALLBACK18-NEXT: movsbl %cl, %ecx +; FALLBACK18-NEXT: shlxl %eax, 28(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 24(%esp,%ecx), %edx +; FALLBACK18-NEXT: shlxl %eax, %edx, %edi ; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, (%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx +; FALLBACK18-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK18-NEXT: orl %esi, %edx +; FALLBACK18-NEXT: movl 20(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl %esi, %ebp +; FALLBACK18-NEXT: shrl %ebp +; FALLBACK18-NEXT: shrxl %ebx, %ebp, %ebp +; FALLBACK18-NEXT: orl %edi, %ebp +; FALLBACK18-NEXT: shlxl %eax, %esi, %esi +; FALLBACK18-NEXT: movl 16(%esp,%ecx), %ecx +; FALLBACK18-NEXT: shlxl %eax, %ecx, %eax ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl %ebp, (%ecx) -; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: orl %esi, %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK18-NEXT: movl %eax, (%esi) +; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %ebp, 8(%esi) +; FALLBACK18-NEXT: movl %edx, 12(%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1833,37 +1779,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $44, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK19-NEXT: movl (%edx), %esi -; FALLBACK19-NEXT: movl 4(%edx), %edi -; FALLBACK19-NEXT: movl 8(%edx), %ebx -; FALLBACK19-NEXT: movl 12(%edx), %edx -; FALLBACK19-NEXT: 
movzbl (%ecx), %eax +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movzbl (%eax), %eax ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, (%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: xorps %xmm1, %xmm1 +; FALLBACK19-NEXT: movaps %xmm1, (%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $12, %al ; FALLBACK19-NEXT: negb %al -; FALLBACK19-NEXT: movsbl %al, %eax -; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi -; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx -; FALLBACK19-NEXT: shldl %cl, %esi, %edx -; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi -; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax -; FALLBACK19-NEXT: shldl %cl, %eax, %esi -; FALLBACK19-NEXT: shldl %cl, %edi, %eax -; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx -; FALLBACK19-NEXT: movl %esi, 8(%ebp) -; FALLBACK19-NEXT: movl %edx, 12(%ebp) -; FALLBACK19-NEXT: movl %ecx, (%ebp) -; FALLBACK19-NEXT: movl %eax, 4(%ebp) +; FALLBACK19-NEXT: movsbl %al, %esi +; FALLBACK19-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK19-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK19-NEXT: shldl %cl, %edx, %eax +; FALLBACK19-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK19-NEXT: shldl %cl, %edi, %edx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK19-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK19-NEXT: shlxl %ecx, %esi, %ebp +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: shldl %cl, %esi, %edi +; FALLBACK19-NEXT: movl %edi, 4(%ebx) +; FALLBACK19-NEXT: movl %edx, 8(%ebx) +; FALLBACK19-NEXT: movl %eax, 12(%ebx) +; FALLBACK19-NEXT: movl %ebp, (%ebx) ; FALLBACK19-NEXT: addl $44, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi @@ -1882,45 +1823,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al +; FALLBACK20-NEXT: movb %cl, %ch +; FALLBACK20-NEXT: shlb $3, %ch ; FALLBACK20-NEXT: xorps %xmm1, %xmm1 ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $12, %cl ; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %edi -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: movsbl %cl, %ebp +; FALLBACK20-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %ch, %dl ; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK20-NEXT: movl %eax, %esi ; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; 
FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%esp,%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebp, %edi -; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK20-NEXT: movl 32(%esp,%ebp), %eax +; FALLBACK20-NEXT: movl %eax, %ebp ; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: movl %eax, (%edx) ; FALLBACK20-NEXT: movl %ebp, 4(%edx) @@ -1942,30 +1883,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: subl $44, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movups (%edx), %xmm0 -; FALLBACK21-NEXT: movzbl (%ecx), %edx -; FALLBACK21-NEXT: movl %edx, %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movzbl (%eax), %eax +; FALLBACK21-NEXT: movl %eax, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm1, %xmm1 ; FALLBACK21-NEXT: movaps %xmm1, (%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $12, %dl -; FALLBACK21-NEXT: negb %dl -; FALLBACK21-NEXT: movsbl %dl, %edi -; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK21-NEXT: shldl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %ebx, %ebp +; FALLBACK21-NEXT: andb $12, %al +; FALLBACK21-NEXT: negb %al +; FALLBACK21-NEXT: movsbl %al, %esi +; FALLBACK21-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK21-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK21-NEXT: shldl %cl, %edx, %eax +; FALLBACK21-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK21-NEXT: shldl %cl, %edi, %edx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK21-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK21-NEXT: movl %esi, %ebp ; FALLBACK21-NEXT: shll %cl, %ebp -; FALLBACK21-NEXT: shldl %cl, %ebx, %edi -; FALLBACK21-NEXT: movl %edi, 4(%eax) -; FALLBACK21-NEXT: movl %esi, 8(%eax) -; FALLBACK21-NEXT: movl %edx, 12(%eax) -; FALLBACK21-NEXT: movl %ebp, (%eax) +; FALLBACK21-NEXT: shldl %cl, %esi, %edi +; FALLBACK21-NEXT: movl %edi, 4(%ebx) +; FALLBACK21-NEXT: movl %edx, 8(%ebx) +; FALLBACK21-NEXT: movl %eax, 12(%ebx) +; FALLBACK21-NEXT: movl %ebp, (%ebx) ; FALLBACK21-NEXT: addl $44, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -2032,30 +1973,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: subl $44, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; 
FALLBACK23-NEXT: movups (%edx), %xmm0 -; FALLBACK23-NEXT: movzbl (%ecx), %edx -; FALLBACK23-NEXT: movl %edx, %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movzbl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm1, %xmm1 ; FALLBACK23-NEXT: movaps %xmm1, (%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $12, %dl -; FALLBACK23-NEXT: negb %dl -; FALLBACK23-NEXT: movsbl %dl, %edi -; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK23-NEXT: shldl %cl, %esi, %edx -; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK23-NEXT: shldl %cl, %edi, %esi -; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK23-NEXT: andb $12, %al +; FALLBACK23-NEXT: negb %al +; FALLBACK23-NEXT: movsbl %al, %esi +; FALLBACK23-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK23-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK23-NEXT: shldl %cl, %edx, %eax +; FALLBACK23-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK23-NEXT: shldl %cl, %edi, %edx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK23-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK23-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shldl %cl, %ebx, %edi -; FALLBACK23-NEXT: movl %edi, 4(%eax) -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl %edx, 12(%eax) -; FALLBACK23-NEXT: movl %ebp, (%eax) +; FALLBACK23-NEXT: shldl %cl, %esi, %edi +; FALLBACK23-NEXT: movl %edi, 4(%ebx) +; FALLBACK23-NEXT: movl %edx, 8(%ebx) +; FALLBACK23-NEXT: movl %eax, 12(%ebx) +; FALLBACK23-NEXT: movl %ebp, (%ebx) ; FALLBACK23-NEXT: addl $44, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi @@ -2074,45 +2015,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al +; FALLBACK24-NEXT: movb %cl, %ch +; FALLBACK24-NEXT: shlb $3, %ch ; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $12, %cl ; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %edi -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: movsbl %cl, %ebp +; FALLBACK24-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %ch, %dl ; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, %esi ; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK24-NEXT: movl 36(%esp,%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebp, %edi -; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK24-NEXT: movl 32(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, %ebp ; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: movl %eax, (%edx) ; FALLBACK24-NEXT: movl %ebp, 4(%edx) @@ -2134,30 +2075,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: subl $44, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: vmovups (%edx), %xmm0 -; FALLBACK25-NEXT: movzbl (%ecx), %edx -; FALLBACK25-NEXT: movl %edx, %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK25-NEXT: movzbl (%eax), %eax +; FALLBACK25-NEXT: movl %eax, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $12, %dl -; FALLBACK25-NEXT: negb %dl -; FALLBACK25-NEXT: movsbl %dl, %edi -; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK25-NEXT: shldl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %ebx, %ebp +; FALLBACK25-NEXT: andb $12, %al +; FALLBACK25-NEXT: negb %al +; FALLBACK25-NEXT: movsbl %al, %esi +; FALLBACK25-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK25-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK25-NEXT: shldl %cl, %edx, %eax +; FALLBACK25-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK25-NEXT: shldl %cl, %edi, %edx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK25-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK25-NEXT: movl %esi, %ebp ; FALLBACK25-NEXT: shll %cl, %ebp -; FALLBACK25-NEXT: shldl %cl, %ebx, %edi -; FALLBACK25-NEXT: movl %edi, 4(%eax) -; FALLBACK25-NEXT: movl %esi, 8(%eax) -; FALLBACK25-NEXT: movl %edx, 12(%eax) -; FALLBACK25-NEXT: movl %ebp, (%eax) +; FALLBACK25-NEXT: shldl %cl, %esi, %edi +; FALLBACK25-NEXT: movl %edi, 4(%ebx) +; FALLBACK25-NEXT: movl %edx, 8(%ebx) +; FALLBACK25-NEXT: movl %eax, 12(%ebx) +; FALLBACK25-NEXT: movl %ebp, (%ebx) ; FALLBACK25-NEXT: addl $44, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -2224,30 +2165,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: subl $44, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: vmovups (%edx), %xmm0 -; FALLBACK27-NEXT: movzbl (%ecx), %edx -; 
FALLBACK27-NEXT: movl %edx, %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK27-NEXT: movzbl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $12, %dl -; FALLBACK27-NEXT: negb %dl -; FALLBACK27-NEXT: movsbl %dl, %edi -; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK27-NEXT: shldl %cl, %esi, %edx -; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK27-NEXT: shldl %cl, %edi, %esi -; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK27-NEXT: andb $12, %al +; FALLBACK27-NEXT: negb %al +; FALLBACK27-NEXT: movsbl %al, %esi +; FALLBACK27-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK27-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK27-NEXT: shldl %cl, %edx, %eax +; FALLBACK27-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK27-NEXT: shldl %cl, %edi, %edx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK27-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK27-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shldl %cl, %ebx, %edi -; FALLBACK27-NEXT: movl %edi, 4(%eax) -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl %edx, 12(%eax) -; FALLBACK27-NEXT: movl %ebp, (%eax) +; FALLBACK27-NEXT: shldl %cl, %esi, %edi +; FALLBACK27-NEXT: movl %edi, 4(%ebx) +; FALLBACK27-NEXT: movl %edx, 8(%ebx) +; FALLBACK27-NEXT: movl %eax, 12(%ebx) +; FALLBACK27-NEXT: movl %ebp, (%ebx) ; FALLBACK27-NEXT: addl $44, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi @@ -2266,45 +2207,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movl %ecx, %eax -; FALLBACK28-NEXT: shlb $3, %al +; FALLBACK28-NEXT: movb %cl, %ch +; FALLBACK28-NEXT: shlb $3, %ch ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $12, %cl ; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %edi -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: movsbl %cl, %ebp +; FALLBACK28-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %ch, %dl ; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, %esi ; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%esp,%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, 
%edi ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebp, %edi -; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK28-NEXT: movl 32(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, %ebp ; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: movl %eax, (%edx) ; FALLBACK28-NEXT: movl %ebp, 4(%edx) @@ -2326,30 +2267,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: subl $44, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: vmovups (%edx), %xmm0 -; FALLBACK29-NEXT: movzbl (%ecx), %edx -; FALLBACK29-NEXT: movl %edx, %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK29-NEXT: movzbl (%eax), %eax +; FALLBACK29-NEXT: movl %eax, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $12, %dl -; FALLBACK29-NEXT: negb %dl -; FALLBACK29-NEXT: movsbl %dl, %edi -; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK29-NEXT: shldl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %ebx, %ebp +; FALLBACK29-NEXT: andb $12, %al +; FALLBACK29-NEXT: negb %al +; FALLBACK29-NEXT: movsbl %al, %esi +; FALLBACK29-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK29-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK29-NEXT: shldl %cl, %edx, %eax +; FALLBACK29-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK29-NEXT: shldl %cl, %edi, %edx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK29-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK29-NEXT: movl %esi, %ebp ; FALLBACK29-NEXT: shll %cl, %ebp -; FALLBACK29-NEXT: shldl %cl, %ebx, %edi -; FALLBACK29-NEXT: movl %edi, 4(%eax) -; FALLBACK29-NEXT: movl %esi, 8(%eax) -; FALLBACK29-NEXT: movl %edx, 12(%eax) -; FALLBACK29-NEXT: movl %ebp, (%eax) +; FALLBACK29-NEXT: shldl %cl, %esi, %edi +; FALLBACK29-NEXT: movl %edi, 4(%ebx) +; FALLBACK29-NEXT: movl %edx, 8(%ebx) +; FALLBACK29-NEXT: movl %eax, 12(%ebx) +; FALLBACK29-NEXT: movl %ebp, (%ebx) ; FALLBACK29-NEXT: addl $44, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -2416,30 +2357,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: subl $44, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: vmovups (%edx), %xmm0 -; FALLBACK31-NEXT: movzbl (%ecx), %edx -; FALLBACK31-NEXT: movl %edx, %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 +; 
FALLBACK31-NEXT: movzbl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $12, %dl -; FALLBACK31-NEXT: negb %dl -; FALLBACK31-NEXT: movsbl %dl, %edi -; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK31-NEXT: shldl %cl, %esi, %edx -; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK31-NEXT: shldl %cl, %edi, %esi -; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK31-NEXT: andb $12, %al +; FALLBACK31-NEXT: negb %al +; FALLBACK31-NEXT: movsbl %al, %esi +; FALLBACK31-NEXT: movl 24(%esp,%esi), %edx +; FALLBACK31-NEXT: movl 28(%esp,%esi), %eax +; FALLBACK31-NEXT: shldl %cl, %edx, %eax +; FALLBACK31-NEXT: movl 20(%esp,%esi), %edi +; FALLBACK31-NEXT: shldl %cl, %edi, %edx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK31-NEXT: movl 16(%esp,%esi), %esi +; FALLBACK31-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shldl %cl, %ebx, %edi -; FALLBACK31-NEXT: movl %edi, 4(%eax) -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl %edx, 12(%eax) -; FALLBACK31-NEXT: movl %ebp, (%eax) +; FALLBACK31-NEXT: shldl %cl, %esi, %edi +; FALLBACK31-NEXT: movl %edi, 4(%ebx) +; FALLBACK31-NEXT: movl %edx, 8(%ebx) +; FALLBACK31-NEXT: movl %eax, 12(%ebx) +; FALLBACK31-NEXT: movl %ebp, (%ebx) ; FALLBACK31-NEXT: addl $44, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi @@ -2533,40 +2474,22 @@ define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; ; X86-SSE2-LABEL: shl_16bytes_dwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $32, %esp +; X86-SSE2-NEXT: subl $44, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %esi -; X86-SSE2-NEXT: movl 4(%edx), %edi -; X86-SSE2-NEXT: movl 8(%edx), %ebx -; X86-SSE2-NEXT: movl 12(%edx), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 ; X86-SSE2-NEXT: movzbl (%ecx), %ecx -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, (%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-NEXT: movaps %xmm1, (%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shlb $2, %cl ; X86-SSE2-NEXT: andb $12, %cl ; X86-SSE2-NEXT: negb %cl ; X86-SSE2-NEXT: movsbl %cl, %ecx -; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx -; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) -; X86-SSE2-NEXT: movl %esi, 4(%eax) -; X86-SSE2-NEXT: addl $32, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: movups 16(%esp,%ecx), %xmm0 +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $44, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_16bytes_dwordOff: @@ -2620,24 +2543,24 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) 
nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rsi,%rsi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al -; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes: @@ -2719,32 +2642,34 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %ah -; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebp -; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi -; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx ; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl -; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebx), %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi -; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx -; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi -; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebx), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, 
%ebp +; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx @@ -2769,38 +2694,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movb (%ecx), %ch -; X86-HAVE-SHLD-NO-BMI2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %ch -; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %ch, %ebx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%ebx), %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%ebx), %edx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%ebx), %ebp -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, %edi -; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edi -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%ebx), %ebx -; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %esi -; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebp, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %al +; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %al, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%esi), %eax +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%esi), %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %eax +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%esi), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%eax) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax) -; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 
4(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%ebp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi @@ -2872,41 +2797,41 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %al -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %eax -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%eax), %ebx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%eax), %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%eax), %esi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%eax), %eax -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %ebx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 8(%ebp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %eax, %eax -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%edi), %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %ebx, %ebx ; 
X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%ebp) -; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%ebp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -2925,24 +2850,24 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rsi,%rsi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl -; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8 -; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al -; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi -; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) -; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: @@ -3022,14 +2947,8 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $3, %ecx -; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx -; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) -; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE2-NEXT: movups %xmm0, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -3110,164 +3029,153 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_32bytes: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; 
FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movzbl (%rsi), %ecx +; FALLBACK0-NEXT: leal (,%rcx,8), %eax +; FALLBACK0-NEXT: xorps %xmm2, %xmm2 +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: andb $24, %cl +; FALLBACK0-NEXT: movzbl %cl, %r9d +; FALLBACK0-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK0-NEXT: movq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: addq %r11, %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 8(%rdx) +; FALLBACK0-NEXT: movq %r14, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_32bytes: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movzbl (%rsi), %eax +; FALLBACK1-NEXT: leal (,%rax,8), %ecx +; FALLBACK1-NEXT: xorps %xmm2, %xmm2 +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, 
-{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: andb $24, %al +; FALLBACK1-NEXT: movzbl %al, %eax +; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq %rdi, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %r10 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %r9, %rax ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: shrq %cl, %rsi +; FALLBACK1-NEXT: movq %r10, 8(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: lshr_32bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movzbl (%rsi), %ecx +; FALLBACK2-NEXT: leal (,%rcx,8), %eax +; FALLBACK2-NEXT: xorps %xmm2, %xmm2 +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: andb $24, %cl +; FALLBACK2-NEXT: movzbl %cl, %ecx +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r9, %rcx +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq 
%rax, %r8, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movzbl (%rsi), %eax +; FALLBACK3-NEXT: leal (,%rax,8), %ecx +; FALLBACK3-NEXT: xorps %xmm2, %xmm2 +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: andb $24, %al +; FALLBACK3-NEXT: movzbl %al, %eax +; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq %rdi, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %r10 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %r9, %rax +; FALLBACK3-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK3-NEXT: movq %r10, 8(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rcx, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: lshr_32bytes: ; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 @@ -3280,38 +3188,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $24, %cl ; FALLBACK4-NEXT: movzbl %cl, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK4-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r11 +; 
FALLBACK4-NEXT: movq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: orq %rbx, %r14 +; FALLBACK4-NEXT: addq %r11, %r11 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %r8, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 8(%rdx) +; FALLBACK4-NEXT: movq %r14, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: lshr_32bytes: @@ -3331,17 +3240,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq %r9, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK5-NEXT: shrdq %cl, %r9, %rax ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shrq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: lshr_32bytes: @@ -3398,20 +3307,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq %r9, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK7-NEXT: shrdq %cl, %r9, %rax +; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: movq %rcx, 24(%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: lshr_32bytes: ; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx @@ -3421,38 +3331,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $24, %cl ; FALLBACK8-NEXT: movzbl %cl, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 
+; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK8-NEXT: movq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: orq %rbx, %r14 +; FALLBACK8-NEXT: addq %r11, %r11 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %r8, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 8(%rdx) +; FALLBACK8-NEXT: movq %r14, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; @@ -3470,17 +3381,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq %r9, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK9-NEXT: shrdq %cl, %r9, %rax ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; @@ -3533,21 +3444,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq %r9, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK11-NEXT: shrdq %cl, %r9, %rax +; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: movq %rcx, 24(%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: vzeroupper 
; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: lshr_32bytes: ; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), %ecx @@ -3557,38 +3469,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $24, %cl ; FALLBACK12-NEXT: movzbl %cl, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK12-NEXT: movq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: orq %rbx, %r14 +; FALLBACK12-NEXT: addq %r11, %r11 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %r8, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 8(%rdx) +; FALLBACK12-NEXT: movq %r14, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; @@ -3606,17 +3519,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq %r9, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK13-NEXT: shrdq %cl, %r9, %rax ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shrq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; @@ -3669,16 +3582,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq 
%rax, %r10 +; FALLBACK15-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq %r9, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK15-NEXT: shrdq %cl, %r9, %rax +; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: movq %rcx, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; @@ -3690,119 +3603,99 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $108, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK16-NEXT: movl (%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ebp), %ecx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK16-NEXT: movzbl (%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, %eax +; FALLBACK16-NEXT: shlb $3, %al +; FALLBACK16-NEXT: xorps %xmm2, %xmm2 +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $28, %cl +; FALLBACK16-NEXT: movzbl %cl, %ebx +; FALLBACK16-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%esp,%ebx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ebp), %ecx +; FALLBACK16-NEXT: movb %al, %ah +; FALLBACK16-NEXT: notb %ah +; FALLBACK16-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK16-NEXT: movb %ah, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 44(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 48(%esp,%ebx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %edi -; FALLBACK16-NEXT: movl 16(%ebp), %ebx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movl 20(%ebp), %esi -; FALLBACK16-NEXT: movl 24(%ebp), %ecx -; FALLBACK16-NEXT: movl 28(%ebp), %ebp -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %ah -; FALLBACK16-NEXT: 
movzbl %ah, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: addl %eax, %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%eax,%eax), %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi -; FALLBACK16-NEXT: movl %edi, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %eax -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK16-NEXT: movb %ah, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %edx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: movb %ah, %cl ; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi -; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK16-NEXT: movl %edx, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 56(%esp,%ebx), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK16-NEXT: movb %ah, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ah, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %ebx +; FALLBACK16-NEXT: movb %ah, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ah, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 28(%eax) -; FALLBACK16-NEXT: movl %esi, 24(%eax) -; FALLBACK16-NEXT: movl %edi, 16(%eax) +; FALLBACK16-NEXT: movl %edi, 28(%eax) +; FALLBACK16-NEXT: movl %esi, 4(%eax) +; FALLBACK16-NEXT: movl %ebx, 24(%eax) +; FALLBACK16-NEXT: movl %edx, 16(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl %ebp, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $108, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi @@ -3816,73 +3709,60 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp +; FALLBACK17-NEXT: subl $108, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl (%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ebp), %esi -; FALLBACK17-NEXT: movl 12(%ebp), %edi -; FALLBACK17-NEXT: movl 16(%ebp), %ebx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movl 20(%ebp), %edx -; FALLBACK17-NEXT: movl 24(%ebp), %eax -; FALLBACK17-NEXT: movl 28(%ebp), %ebp -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) 
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: xorps %xmm2, %xmm2 +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebp -; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: andb $28, %al +; FALLBACK17-NEXT: movzbl %al, %edi +; FALLBACK17-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK17-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 24(%ebp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %esi -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl %ebx, 16(%ebp) -; FALLBACK17-NEXT: movl %edi, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK17-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK17-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK17-NEXT: movl %esi, %edx +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK17-NEXT: movl 32(%esp,%edi), %edi +; 
FALLBACK17-NEXT: shrdl %cl, %esi, %edi +; FALLBACK17-NEXT: shrl %cl, %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %edx, 4(%ecx) +; FALLBACK17-NEXT: movl %ebp, 24(%ecx) +; FALLBACK17-NEXT: movl %eax, 28(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %esi, (%ebp) +; FALLBACK17-NEXT: movl %eax, 16(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $92, %esp +; FALLBACK17-NEXT: movl %eax, 20(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ecx) +; FALLBACK17-NEXT: movl %edi, (%ecx) +; FALLBACK17-NEXT: addl $108, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx @@ -3896,99 +3776,82 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %esi -; FALLBACK18-NEXT: movl 12(%eax), %edi -; FALLBACK18-NEXT: movl 16(%eax), %ebp -; FALLBACK18-NEXT: movzbl (%ebx), %ebx -; FALLBACK18-NEXT: movl 20(%eax), %edx -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl 28(%eax), %eax -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK18-NEXT: movzbl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, %edx +; FALLBACK18-NEXT: shlb $3, %dl +; FALLBACK18-NEXT: xorps %xmm2, %xmm2 +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi +; FALLBACK18-NEXT: andb $28, %cl +; FALLBACK18-NEXT: movzbl %cl, %edi +; FALLBACK18-NEXT: shrxl %edx, 32(%esp,%edi), %ecx +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: notb %al ; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl 
%eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx +; FALLBACK18-NEXT: shlxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ebx ; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl 40(%esp,%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %edx, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ebp, %ebx, %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %ebx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %ebp, %ebx +; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: 
orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl %ecx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK18-NEXT: orl %ebp, %edi ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %esi, %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl %ecx, 4(%edx) +; FALLBACK18-NEXT: movl %edi, 24(%edx) +; FALLBACK18-NEXT: movl %ebx, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4002,74 +3865,59 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK19-NEXT: subl $108, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %esi -; FALLBACK19-NEXT: movl 12(%ecx), %edi -; FALLBACK19-NEXT: movl 16(%ecx), %ebp -; FALLBACK19-NEXT: movzbl (%ebx), %ebx -; FALLBACK19-NEXT: movl 20(%ecx), %edx -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl 28(%ecx), %ecx -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, %ecx +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK19-NEXT: movzbl (%eax), %eax +; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: xorps %xmm2, %xmm2 +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %bl -; FALLBACK19-NEXT: movzbl %bl, %ebp -; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %esi, %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: andb $28, %al +; FALLBACK19-NEXT: movzbl %al, %edi +; FALLBACK19-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK19-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edx, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl %edx, %esi ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi -; FALLBACK19-NEXT: shrdl %cl, %edi, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl %ebx, 16(%ebp) -; FALLBACK19-NEXT: movl %esi, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK19-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, %ebx +; FALLBACK19-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK19-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl %esi, 4(%eax) +; FALLBACK19-NEXT: movl %ebp, 24(%eax) +; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK19-NEXT: movl %esi, 28(%eax) +; FALLBACK19-NEXT: movl %edx, 16(%eax) ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $92, %esp +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %ebx, 20(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 8(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 12(%eax) +; 
FALLBACK19-NEXT: movl %edi, (%eax) +; FALLBACK19-NEXT: addl $108, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx @@ -4096,79 +3944,79 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movzbl %cl, %ebx +; FALLBACK20-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%esp,%ebx), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %ah +; FALLBACK20-NEXT: notb %ah +; FALLBACK20-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK20-NEXT: movb %ah, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 44(%esp,%ebx), %edi +; FALLBACK20-NEXT: movl %edi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 48(%esp,%ebx), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx +; FALLBACK20-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK20-NEXT: movb %ah, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %edx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %ebx +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movb %ah, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK20-NEXT: movl %edx, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx -; 
FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 56(%esp,%ebx), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ah, %cl ; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ah, %cl +; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %ebx +; FALLBACK20-NEXT: movb %ah, %cl +; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ah, %cl ; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 28(%eax) +; FALLBACK20-NEXT: movl %edi, 28(%eax) ; FALLBACK20-NEXT: movl %esi, 4(%eax) -; FALLBACK20-NEXT: movl %edi, 24(%eax) -; FALLBACK20-NEXT: movl %ebp, 16(%eax) +; FALLBACK20-NEXT: movl %ebx, 24(%eax) +; FALLBACK20-NEXT: movl %edx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4204,45 +4052,45 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: movzbl %al, %ebp -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK21-NEXT: movzbl %al, %edi +; FALLBACK21-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK21-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 
40(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK21-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl %edi, %esi -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %esi, 4(%ebp) -; FALLBACK21-NEXT: movl %ebx, 24(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK21-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK21-NEXT: movl %esi, %edx +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK21-NEXT: shrdl %cl, %esi, %edi ; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl %edx, 4(%ecx) +; FALLBACK21-NEXT: movl %ebp, 24(%ecx) +; FALLBACK21-NEXT: movl %eax, 28(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl %eax, 16(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl %eax, 20(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl %eax, 8(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %edx, (%ebp) +; FALLBACK21-NEXT: movl %eax, 12(%ecx) +; FALLBACK21-NEXT: movl %edi, (%ecx) ; FALLBACK21-NEXT: addl $108, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -4360,44 +4208,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: movzbl %al, %ebx -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edi -; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK23-NEXT: movzbl %al, %edi +; 
FALLBACK23-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK23-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK23-NEXT: movl %eax, %esi +; FALLBACK23-NEXT: shrdl %cl, %edx, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK23-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK23-NEXT: movl %eax, %ebx +; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %ebx, 4(%eax) +; FALLBACK23-NEXT: movl %esi, 4(%eax) ; FALLBACK23-NEXT: movl %ebp, 24(%eax) -; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK23-NEXT: movl %ebx, 28(%eax) -; FALLBACK23-NEXT: movl %esi, 16(%eax) -; FALLBACK23-NEXT: movl %edi, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 12(%eax) +; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK23-NEXT: movl %esi, 28(%eax) +; FALLBACK23-NEXT: movl %edx, 16(%eax) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, (%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %ebx, 20(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 8(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%eax) +; FALLBACK23-NEXT: movl %edi, (%eax) ; FALLBACK23-NEXT: addl $108, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi @@ -4422,79 +4270,79 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; 
FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: movzbl %cl, %ebx +; FALLBACK24-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 36(%esp,%ebx), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx +; FALLBACK24-NEXT: movb %al, %ah +; FALLBACK24-NEXT: notb %ah +; FALLBACK24-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK24-NEXT: movb %ah, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 44(%esp,%ebx), %edi +; FALLBACK24-NEXT: movl %edi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %ebx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 48(%esp,%ebx), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK24-NEXT: movb %ah, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %edx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movb %ah, %cl ; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK24-NEXT: movl %edx, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 56(%esp,%ebx), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK24-NEXT: movb %ah, %cl +; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; 
FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ah, %cl +; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %ebx +; FALLBACK24-NEXT: movb %ah, %cl +; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %ah, %cl ; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 28(%eax) +; FALLBACK24-NEXT: movl %edi, 28(%eax) ; FALLBACK24-NEXT: movl %esi, 4(%eax) -; FALLBACK24-NEXT: movl %edi, 24(%eax) -; FALLBACK24-NEXT: movl %ebp, 16(%eax) +; FALLBACK24-NEXT: movl %ebx, 24(%eax) +; FALLBACK24-NEXT: movl %edx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4528,45 +4376,45 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: movzbl %al, %ebp -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK25-NEXT: movzbl %al, %edi +; FALLBACK25-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK25-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK25-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx 
-; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl %edi, %esi -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %esi, 4(%ebp) -; FALLBACK25-NEXT: movl %ebx, 24(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK25-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK25-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK25-NEXT: movl %esi, %edx +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK25-NEXT: shrdl %cl, %esi, %edi ; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %edx, 4(%ecx) +; FALLBACK25-NEXT: movl %ebp, 24(%ecx) +; FALLBACK25-NEXT: movl %eax, 28(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl %eax, 16(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl %eax, 20(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl %eax, 8(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %edx, (%ebp) +; FALLBACK25-NEXT: movl %eax, 12(%ecx) +; FALLBACK25-NEXT: movl %edi, (%ecx) ; FALLBACK25-NEXT: addl $108, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -4680,44 +4528,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: movzbl %al, %ebx -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edi -; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK27-NEXT: movzbl %al, %edi +; FALLBACK27-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK27-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK27-NEXT: movl %eax, %esi +; FALLBACK27-NEXT: shrdl %cl, %edx, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK27-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK27-NEXT: movl %eax, %ebx +; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl 
%cl, %eax, %ebp -; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %ebx, 4(%eax) +; FALLBACK27-NEXT: movl %esi, 4(%eax) ; FALLBACK27-NEXT: movl %ebp, 24(%eax) -; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK27-NEXT: movl %ebx, 28(%eax) -; FALLBACK27-NEXT: movl %esi, 16(%eax) -; FALLBACK27-NEXT: movl %edi, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 12(%eax) +; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK27-NEXT: movl %esi, 28(%eax) +; FALLBACK27-NEXT: movl %edx, 16(%eax) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, (%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %ebx, 20(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 8(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 12(%eax) +; FALLBACK27-NEXT: movl %edi, (%eax) ; FALLBACK27-NEXT: addl $108, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi @@ -4743,79 +4591,79 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl -; FALLBACK28-NEXT: movzbl %cl, %edi -; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movzbl %cl, %ebx +; FALLBACK28-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%esp,%ebx), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %ah +; FALLBACK28-NEXT: notb %ah +; FALLBACK28-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK28-NEXT: movb %ah, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 44(%esp,%ebx), %edi +; FALLBACK28-NEXT: movl %edi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; 
FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 48(%esp,%ebx), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx +; FALLBACK28-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK28-NEXT: movb %ah, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %edx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %ebx +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movb %ah, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK28-NEXT: movl %edx, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 56(%esp,%ebx), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ah, %cl ; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ah, %cl +; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %ebx +; 
FALLBACK28-NEXT: movb %ah, %cl +; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ah, %cl ; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %ebx, 28(%eax) +; FALLBACK28-NEXT: movl %edi, 28(%eax) ; FALLBACK28-NEXT: movl %esi, 4(%eax) -; FALLBACK28-NEXT: movl %edi, 24(%eax) -; FALLBACK28-NEXT: movl %ebp, 16(%eax) +; FALLBACK28-NEXT: movl %ebx, 24(%eax) +; FALLBACK28-NEXT: movl %edx, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4849,45 +4697,45 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: movzbl %al, %ebp -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK29-NEXT: movzbl %al, %edi +; FALLBACK29-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK29-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK29-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl %edi, %esi -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %esi, 4(%ebp) -; FALLBACK29-NEXT: movl %ebx, 24(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK29-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK29-NEXT: movl %esi, %edx +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK29-NEXT: shrdl %cl, %esi, %edi ; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
FALLBACK29-NEXT: movl %edx, 4(%ecx) +; FALLBACK29-NEXT: movl %ebp, 24(%ecx) +; FALLBACK29-NEXT: movl %eax, 28(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl %eax, 16(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl %eax, 20(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl %eax, 8(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %edx, (%ebp) +; FALLBACK29-NEXT: movl %eax, 12(%ecx) +; FALLBACK29-NEXT: movl %edi, (%ecx) ; FALLBACK29-NEXT: addl $108, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -5001,44 +4849,44 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: movzbl %al, %ebx -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edi -; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK31-NEXT: movzbl %al, %edi +; FALLBACK31-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK31-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK31-NEXT: movl %eax, %esi +; FALLBACK31-NEXT: shrdl %cl, %edx, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK31-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK31-NEXT: movl %eax, %ebx +; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %ebx, 4(%eax) +; FALLBACK31-NEXT: movl %esi, 4(%eax) ; FALLBACK31-NEXT: movl %ebp, 24(%eax) -; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK31-NEXT: movl %ebx, 28(%eax) -; FALLBACK31-NEXT: movl %esi, 16(%eax) -; FALLBACK31-NEXT: movl %edi, 
20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 12(%eax) +; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK31-NEXT: movl %esi, 28(%eax) +; FALLBACK31-NEXT: movl %edx, 16(%eax) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, (%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %ebx, 20(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 8(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%eax) +; FALLBACK31-NEXT: movl %edi, (%eax) ; FALLBACK31-NEXT: addl $108, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi @@ -5057,167 +4905,156 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_32bytes_dwordOff: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movzbl (%rsi), %ecx +; FALLBACK0-NEXT: movl %ecx, %eax ; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: xorps %xmm2, %xmm2 +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $6, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: andb $6, %cl +; FALLBACK0-NEXT: movzbl %cl, %r9d +; FALLBACK0-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK0-NEXT: movq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; 
FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: addq %r11, %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 8(%rdx) +; FALLBACK0-NEXT: movq %r14, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_32bytes_dwordOff: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: movl %esi, %ecx +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movzbl (%rsi), %eax +; FALLBACK1-NEXT: movl %eax, %ecx ; FALLBACK1-NEXT: shlb $5, %cl -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: xorps %xmm2, %xmm2 +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $6, %sil -; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: andb $6, %al +; FALLBACK1-NEXT: movzbl %al, %eax +; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK1-NEXT: movq %rdi, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK1-NEXT: movq %r9, %r10 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK1-NEXT: shrdq %cl, %r9, %rax +; FALLBACK1-NEXT: shrq %cl, %rsi +; FALLBACK1-NEXT: movq %r10, 8(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: lshr_32bytes_dwordOff: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movzbl (%rsi), %ecx +; FALLBACK2-NEXT: movl %ecx, 
%eax ; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: xorps %xmm2, %xmm2 +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: andb $6, %cl +; FALLBACK2-NEXT: movzbl %cl, %ecx +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r9, %rcx +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rax, %r8, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movzbl (%rsi), %eax +; FALLBACK3-NEXT: movl %eax, %ecx ; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: xorps %xmm2, %xmm2 +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $6, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; 
FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: andb $6, %al +; FALLBACK3-NEXT: movzbl %al, %eax +; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK3-NEXT: movq %rdi, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK3-NEXT: movq %r9, %r10 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK3-NEXT: shrdq %cl, %r9, %rax +; FALLBACK3-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK3-NEXT: movq %r10, 8(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rcx, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: lshr_32bytes_dwordOff: ; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 @@ -5231,38 +5068,39 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $6, %cl ; FALLBACK4-NEXT: movzbl %cl, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK4-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK4-NEXT: movq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: orq %rbx, %r14 +; FALLBACK4-NEXT: addq %r11, %r11 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %r8, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 8(%rdx) +; FALLBACK4-NEXT: movq %r14, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: lshr_32bytes_dwordOff: @@ -5283,16 +5121,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK5-NEXT: movq %r9, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, 
%r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK5-NEXT: shrdq %cl, %r9, %rax ; FALLBACK5-NEXT: shrq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: lshr_32bytes_dwordOff: @@ -5351,20 +5189,21 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK7-NEXT: movq %r9, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK7-NEXT: shrdq %cl, %r9, %rax +; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: movq %rcx, 24(%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: lshr_32bytes_dwordOff: ; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx @@ -5375,38 +5214,39 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $6, %cl ; FALLBACK8-NEXT: movzbl %cl, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK8-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK8-NEXT: movq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: orq %rbx, %r14 +; FALLBACK8-NEXT: addq %r11, %r11 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %r8, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 8(%rdx) +; FALLBACK8-NEXT: movq %r14, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: vzeroupper ; 
FALLBACK8-NEXT: retq ; @@ -5425,16 +5265,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK9-NEXT: movq %r9, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK9-NEXT: shrdq %cl, %r9, %rax ; FALLBACK9-NEXT: shrq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; @@ -5489,21 +5329,22 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK11-NEXT: movq %r9, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK11-NEXT: shrdq %cl, %r9, %rax +; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: movq %rcx, 24(%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: lshr_32bytes_dwordOff: ; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), %ecx @@ -5514,38 +5355,39 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $6, %cl ; FALLBACK12-NEXT: movzbl %cl, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK12-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK12-NEXT: movq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: orq %rbx, %r14 +; FALLBACK12-NEXT: addq 
%r11, %r11 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %r8, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 8(%rdx) +; FALLBACK12-NEXT: movq %r14, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; @@ -5564,16 +5406,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK13-NEXT: movq %r9, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK13-NEXT: shrdq %cl, %r9, %rax ; FALLBACK13-NEXT: shrq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; @@ -5628,79 +5470,39 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK15-NEXT: movq %r9, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK15-NEXT: shrdq %cl, %r9, %rax +; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: movq %rcx, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; X86-SSE2-LABEL: lshr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: subl $76, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %esi -; X86-SSE2-NEXT: movl 12(%eax), %edi -; X86-SSE2-NEXT: movl 16(%eax), %ebx -; X86-SSE2-NEXT: movl 20(%eax), %ebp -; X86-SSE2-NEXT: movl 24(%eax), %edx -; X86-SSE2-NEXT: movl 28(%eax), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, 
{{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $7, %eax -; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi -; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi -; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx -; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp -; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx -; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm2, %xmm2 +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: andl $7, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE2-NEXT: movups 16(%esp,%ecx,4), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $76, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_32bytes_dwordOff: @@ -5755,27 +5557,19 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: lshr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %rdi -; X64-SSE2-NEXT: movzbl (%rsi), %esi -; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movzbl (%rsi), %eax +; X64-SSE2-NEXT: xorps %xmm2, %xmm2 +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $3, %esi -; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax -; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi -; X64-SSE2-NEXT: movq 
-56(%rsp,%rsi,8), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) -; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: andl $3, %eax +; X64-SSE2-NEXT: movups -72(%rsp,%rax,8), %xmm0 +; X64-SSE2-NEXT: movups -56(%rsp,%rax,8), %xmm1 +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_32bytes_qwordOff: @@ -5812,64 +5606,24 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no ; ; X86-SSE2-LABEL: lshr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %esi -; X86-SSE2-NEXT: movl 12(%eax), %edi -; X86-SSE2-NEXT: movl 16(%eax), %ebx -; X86-SSE2-NEXT: movl 20(%eax), %ebp -; X86-SSE2-NEXT: movl 24(%eax), %edx -; X86-SSE2-NEXT: movl 28(%eax), %ecx +; X86-SSE2-NEXT: subl $76, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $3, %eax -; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi -; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi -; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx -; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp -; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx -; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm2, %xmm2 +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm1, 
{{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: andl $3, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE2-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $76, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_32bytes_qwordOff: @@ -5924,164 +5678,148 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_32bytes: ; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movzbl (%rsi), %ecx +; FALLBACK0-NEXT: leal (,%rcx,8), %eax +; FALLBACK0-NEXT: xorps %xmm2, %xmm2 +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: negb %sil -; FALLBACK0-NEXT: movsbq %sil, %r10 -; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 -; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: andb $24, %cl +; FALLBACK0-NEXT: negb %cl +; FALLBACK0-NEXT: movsbq %cl, %r8 +; FALLBACK0-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: shlq %cl, %r9 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 -; FALLBACK0-NEXT: movq %r10, %rbx -; FALLBACK0-NEXT: shrq %rbx +; FALLBACK0-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK0-NEXT: movq %r10, %rdi +; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: orq %r11, %rbx +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: orq %r9, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: shrq %rdi +; FALLBACK0-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK0-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: shrq %r11 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %rbx, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: orq %r10, %r11 +; FALLBACK0-NEXT: movq %r9, %r10 +; FALLBACK0-NEXT: shrq %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: orq %r8, %r10 
+; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, (%rdx) +; FALLBACK0-NEXT: movq %r10, 8(%rdx) +; FALLBACK0-NEXT: movq %r11, 16(%rdx) +; FALLBACK0-NEXT: movq %rdi, 24(%rdx) ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_32bytes: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movzbl (%rsi), %eax +; FALLBACK1-NEXT: leal (,%rax,8), %ecx +; FALLBACK1-NEXT: xorps %xmm2, %xmm2 +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: negb %sil -; FALLBACK1-NEXT: movsbq %sil, %rax +; FALLBACK1-NEXT: andb $24, %al +; FALLBACK1-NEXT: negb %al +; FALLBACK1-NEXT: movsbq %al, %rax ; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: shldq %cl, %r8, %rax +; FALLBACK1-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK1-NEXT: shldq %cl, %r8, %rsi +; FALLBACK1-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK1-NEXT: movq %rax, %r9 +; FALLBACK1-NEXT: shlq %cl, %r9 ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shlq %cl, %r8 +; FALLBACK1-NEXT: shldq %cl, %rax, %r8 +; FALLBACK1-NEXT: movq %r8, 8(%rdx) ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rdi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rax, 8(%rdx) +; FALLBACK1-NEXT: movq %r9, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: shl_32bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movzbl (%rsi), %ecx +; FALLBACK2-NEXT: leal (,%rcx,8), %eax +; FALLBACK2-NEXT: xorps %xmm2, %xmm2 +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq 
-24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 +; FALLBACK2-NEXT: andb $24, %cl +; FALLBACK2-NEXT: negb %cl +; FALLBACK2-NEXT: movsbq %cl, %rcx +; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK2-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK2-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK2-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r8, %rcx +; FALLBACK2-NEXT: shrq %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, (%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movzbl (%rsi), %eax +; FALLBACK3-NEXT: leal (,%rax,8), %ecx +; FALLBACK3-NEXT: xorps %xmm2, %xmm2 +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: negb %sil -; FALLBACK3-NEXT: movsbq %sil, %rax +; FALLBACK3-NEXT: andb $24, %al +; FALLBACK3-NEXT: negb %al +; FALLBACK3-NEXT: movsbq %al, %rax ; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: shldq %cl, %r8, %rax -; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx +; FALLBACK3-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK3-NEXT: shldq %cl, %r8, %rsi +; FALLBACK3-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK3-NEXT: shlxq %rcx, %rax, %r9 +; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK3-NEXT: shldq %cl, %rax, %r8 +; FALLBACK3-NEXT: movq %r8, 8(%rdx) ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rdi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rax, 8(%rdx) +; FALLBACK3-NEXT: movq %r9, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: shl_32bytes: @@ -6117,9 +5855,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: 
shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: orq %r10, %r11 ; FALLBACK4-NEXT: movq %r9, %r10 ; FALLBACK4-NEXT: shrq %r10 ; FALLBACK4-NEXT: movl %esi, %ecx @@ -6150,14 +5888,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq %r8, %r9 +; FALLBACK5-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK5-NEXT: shldq %cl, %r8, %rsi +; FALLBACK5-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r9 ; FALLBACK5-NEXT: shlq %cl, %r9 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shldq %cl, %r8, %rax -; FALLBACK5-NEXT: movq %rax, 8(%rdx) +; FALLBACK5-NEXT: shldq %cl, %rax, %r8 +; FALLBACK5-NEXT: movq %r8, 8(%rdx) ; FALLBACK5-NEXT: movq %rsi, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) @@ -6218,13 +5956,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK7-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK7-NEXT: shldq %cl, %r8, %rsi +; FALLBACK7-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK7-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r8, %rax -; FALLBACK7-NEXT: movq %rax, 8(%rdx) +; FALLBACK7-NEXT: shldq %cl, %rax, %r8 +; FALLBACK7-NEXT: movq %r8, 8(%rdx) ; FALLBACK7-NEXT: movq %rsi, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) @@ -6260,9 +5998,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movq %r9, %r10 ; FALLBACK8-NEXT: shrq %r10 ; FALLBACK8-NEXT: movl %esi, %ecx @@ -6291,14 +6029,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq %r8, %r9 +; FALLBACK9-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK9-NEXT: shldq %cl, %r8, %rsi +; FALLBACK9-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r9 ; FALLBACK9-NEXT: shlq %cl, %r9 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shldq %cl, %r8, %rax -; FALLBACK9-NEXT: movq %rax, 8(%rdx) +; FALLBACK9-NEXT: shldq %cl, %rax, %r8 +; FALLBACK9-NEXT: movq %r8, 8(%rdx) ; FALLBACK9-NEXT: movq %rsi, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) @@ -6355,13 +6093,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 
FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK11-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK11-NEXT: shldq %cl, %r8, %rsi +; FALLBACK11-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK11-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r8, %rax -; FALLBACK11-NEXT: movq %rax, 8(%rdx) +; FALLBACK11-NEXT: shldq %cl, %rax, %r8 +; FALLBACK11-NEXT: movq %r8, 8(%rdx) ; FALLBACK11-NEXT: movq %rsi, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) @@ -6398,9 +6136,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: orq %r10, %r11 ; FALLBACK12-NEXT: movq %r9, %r10 ; FALLBACK12-NEXT: shrq %r10 ; FALLBACK12-NEXT: movl %esi, %ecx @@ -6429,14 +6167,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq %r8, %r9 +; FALLBACK13-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK13-NEXT: shldq %cl, %r8, %rsi +; FALLBACK13-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK13-NEXT: movq %rax, %r9 ; FALLBACK13-NEXT: shlq %cl, %r9 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shldq %cl, %r8, %rax -; FALLBACK13-NEXT: movq %rax, 8(%rdx) +; FALLBACK13-NEXT: shldq %cl, %rax, %r8 +; FALLBACK13-NEXT: movq %r8, 8(%rdx) ; FALLBACK13-NEXT: movq %rsi, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) @@ -6493,13 +6231,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK15-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK15-NEXT: shldq %cl, %r8, %rsi +; FALLBACK15-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK15-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r8, %rax -; FALLBACK15-NEXT: movq %rax, 8(%rdx) +; FALLBACK15-NEXT: shldq %cl, %rax, %r8 +; FALLBACK15-NEXT: movq %r8, 8(%rdx) ; FALLBACK15-NEXT: movq %rsi, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) @@ -6512,119 +6250,113 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $108, %esp +; FALLBACK16-NEXT: subl $124, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %edx +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movups 
16(%ecx), %xmm1 +; FALLBACK16-NEXT: movzbl (%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, %ebx +; FALLBACK16-NEXT: shlb $3, %bl +; FALLBACK16-NEXT: xorps %xmm2, %xmm2 +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $28, %cl +; FALLBACK16-NEXT: negb %cl +; FALLBACK16-NEXT: movsbl %cl, %ebp +; FALLBACK16-NEXT: movl 100(%esp,%ebp), %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %edx +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ecx), %esi -; FALLBACK16-NEXT: movl 12(%ecx), %edi -; FALLBACK16-NEXT: movl 16(%ecx), %ebx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movl 20(%ecx), %ebp -; FALLBACK16-NEXT: movl 24(%ecx), %edx -; FALLBACK16-NEXT: movl 28(%ecx), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movb %ah, %ch -; FALLBACK16-NEXT: shlb $3, %ch -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %ah -; FALLBACK16-NEXT: negb %ah -; FALLBACK16-NEXT: movsbl %ah, %ebx -; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax +; FALLBACK16-NEXT: movb %bl, %ch +; FALLBACK16-NEXT: notb %ch +; FALLBACK16-NEXT: movl 96(%esp,%ebp), %eax ; FALLBACK16-NEXT: movl %eax, %esi +; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movb %ch, %dl -; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 92(%esp,%ebp), %esi +; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi -; FALLBACK16-NEXT: movl %esi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %eax -; FALLBACK16-NEXT: movb %dl, %cl -; 
FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi -; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 88(%esp,%ebp), %eax +; FALLBACK16-NEXT: movl %eax, %esi +; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %eax -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: orl %edi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK16-NEXT: movl 84(%esp,%ebp), %edx +; FALLBACK16-NEXT: movl %edx, %esi +; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi -; FALLBACK16-NEXT: movl %edi, %ebx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %ebx, %eax +; FALLBACK16-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 80(%esp,%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: shrl %ebx -; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 104(%esp,%ebp), %edi +; FALLBACK16-NEXT: movl %edi, %edx +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %eax ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %edi, %esi +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %eax, %edx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: shrl %cl, 
%edi +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: orl %ebp, %edi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 24(%eax) -; FALLBACK16-NEXT: movl %ebx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl %ebp, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl %ebp, (%eax) +; FALLBACK16-NEXT: movl %edi, 28(%eax) +; FALLBACK16-NEXT: movl %edx, 24(%eax) +; FALLBACK16-NEXT: movl %ebx, 4(%eax) +; FALLBACK16-NEXT: movl %esi, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $108, %esp +; FALLBACK16-NEXT: movl %ecx, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: addl $124, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx @@ -6638,71 +6370,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $92, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl (%eax), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%eax), %edx -; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%eax), %esi -; FALLBACK17-NEXT: movl 12(%eax), %edi -; FALLBACK17-NEXT: movl 16(%eax), %ebx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movl 20(%eax), %ebp -; FALLBACK17-NEXT: movl 24(%eax), %edx -; FALLBACK17-NEXT: movl 28(%eax), %eax -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: xorps %xmm2, %xmm2 +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %ch -; FALLBACK17-NEXT: negb %ch -; FALLBACK17-NEXT: movsbl %ch, %eax -; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx -; FALLBACK17-NEXT: movl 
60(%esp,%eax), %ebx -; FALLBACK17-NEXT: movl %ebx, %esi -; FALLBACK17-NEXT: shldl %cl, %edx, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi -; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: andb $28, %al +; FALLBACK17-NEXT: negb %al +; FALLBACK17-NEXT: movsbl %al, %ebx +; FALLBACK17-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK17-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi -; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp -; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edi, %ebp -; FALLBACK17-NEXT: shldl %cl, %ebx, %edi -; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx -; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx -; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi -; FALLBACK17-NEXT: shldl %cl, %edx, %esi -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK17-NEXT: shldl %cl, %edx, %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 56(%esp,%ebx), %edi +; FALLBACK17-NEXT: shldl %cl, %edi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK17-NEXT: shldl %cl, %ebp, %edi +; FALLBACK17-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl %esi, %edx +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl %edx, 24(%eax) -; FALLBACK17-NEXT: movl %esi, 28(%eax) -; FALLBACK17-NEXT: movl %edi, 16(%eax) -; FALLBACK17-NEXT: movl %ebp, 20(%eax) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, 8(%eax) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, 12(%eax) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %ebx, %edx +; FALLBACK17-NEXT: movl 76(%esp,%ebx), %eax +; FALLBACK17-NEXT: shldl %cl, %esi, %eax +; FALLBACK17-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl %esi, %ebx ; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: movl %ebx, (%eax) -; FALLBACK17-NEXT: movl %edx, 4(%eax) +; FALLBACK17-NEXT: shldl %cl, %esi, %ebp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %eax, 28(%ecx) +; FALLBACK17-NEXT: movl %edx, 24(%ecx) +; FALLBACK17-NEXT: movl %ebp, 4(%ecx) +; FALLBACK17-NEXT: movl %edi, 8(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 16(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 20(%ecx) +; FALLBACK17-NEXT: movl %ebx, (%ecx) ; FALLBACK17-NEXT: addl $92, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -6717,99 +6434,82 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %esi -; FALLBACK18-NEXT: movl 12(%eax), %edi -; FALLBACK18-NEXT: movl 16(%eax), %ebp -; FALLBACK18-NEXT: movzbl (%ebx), %ebx -; FALLBACK18-NEXT: movl 20(%eax), %edx -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl 28(%eax), %eax -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %edx -; FALLBACK18-NEXT: shlb $3, %dl -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK18-NEXT: movzbl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: xorps %xmm2, %xmm2 +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: negb %bl -; FALLBACK18-NEXT: movsbl %bl, %esi -; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx -; FALLBACK18-NEXT: orl %edi, %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, %edi +; FALLBACK18-NEXT: andb $28, %cl +; FALLBACK18-NEXT: negb %cl +; FALLBACK18-NEXT: movsbl %cl, %edx +; FALLBACK18-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK18-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %edi +; FALLBACK18-NEXT: movl %eax, %ebx +; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 76(%esp,%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK18-NEXT: movl 72(%esp,%edx), %esi +; FALLBACK18-NEXT: movl %esi, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx +; FALLBACK18-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK18-NEXT: movl %esi, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp +; FALLBACK18-NEXT: orl %ecx, %ebp +; FALLBACK18-NEXT: shlxl %eax, %esi, %edi +; FALLBACK18-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK18-NEXT: movl %esi, %ecx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shlxl %eax, %esi, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, 92(%esp,%edx), %edi +; FALLBACK18-NEXT: movl 88(%esp,%edx), %edx +; FALLBACK18-NEXT: shlxl %eax, %edx, %esi ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK18-NEXT: orl %ebp, %esi -; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: shrl %edx +; FALLBACK18-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK18-NEXT: movl %edi, (%esi) +; FALLBACK18-NEXT: movl %edx, 28(%esi) +; FALLBACK18-NEXT: movl %eax, 24(%esi) +; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %ebp, 8(%esi) +; FALLBACK18-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%esi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%esi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%esi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -6824,73 +6524,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %esi -; FALLBACK19-NEXT: movl 12(%ecx), %edi -; FALLBACK19-NEXT: movl 16(%ecx), %ebp -; FALLBACK19-NEXT: movzbl (%ebx), %ebx -; FALLBACK19-NEXT: movl 20(%ecx), %edx -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl 28(%ecx), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, %ecx +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK19-NEXT: movzbl (%eax), %eax +; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: xorps %xmm2, %xmm2 +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %bl -; FALLBACK19-NEXT: negb %bl -; FALLBACK19-NEXT: movsbl %bl, %eax -; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx -; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi -; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: shldl %cl, %edx, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx +; FALLBACK19-NEXT: andb $28, %al +; FALLBACK19-NEXT: negb %al +; FALLBACK19-NEXT: movsbl %al, %edi +; FALLBACK19-NEXT: movl 64(%esp,%edi), %eax +; FALLBACK19-NEXT: movl 68(%esp,%edi), %edx +; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %eax, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 60(%esp,%edi), %edx +; FALLBACK19-NEXT: shldl %cl, %edx, %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 56(%esp,%edi), %ebx ; FALLBACK19-NEXT: shldl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi -; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK19-NEXT: shldl %cl, %edi, %ebp -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx -; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx -; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi -; FALLBACK19-NEXT: shldl %cl, %edx, %esi -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK19-NEXT: shldl %cl, %ebp, %ebx +; FALLBACK19-NEXT: movl 72(%esp,%edi), %esi +; FALLBACK19-NEXT: movl %esi, %edx +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: shldl %cl, %eax, %edx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl %edx, 24(%eax) -; FALLBACK19-NEXT: movl %esi, 28(%eax) -; FALLBACK19-NEXT: movl %edi, 16(%eax) -; FALLBACK19-NEXT: movl %ebp, 20(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 8(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 12(%eax) -; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload -; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%eax) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shldl %cl, %esi, %ebx -; FALLBACK19-NEXT: movl %ebx, 4(%eax) +; FALLBACK19-NEXT: movl 76(%esp,%edi), %eax +; FALLBACK19-NEXT: shldl %cl, %esi, %eax +; FALLBACK19-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK19-NEXT: shldl %cl, %esi, %ebp +; FALLBACK19-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK19-NEXT: movl %eax, 28(%esi) +; FALLBACK19-NEXT: movl %edx, 24(%esi) +; FALLBACK19-NEXT: movl %ebp, 4(%esi) +; FALLBACK19-NEXT: movl %ebx, 8(%esi) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 12(%esi) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 16(%esi) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 20(%esi) +; FALLBACK19-NEXT: movl %ecx, (%esi) ; FALLBACK19-NEXT: addl $92, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi @@ -6904,14 +6586,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp +; FALLBACK20-NEXT: subl $124, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movb %cl, %dh -; FALLBACK20-NEXT: shlb $3, %dh +; FALLBACK20-NEXT: movl %ecx, %ebx +; FALLBACK20-NEXT: shlb $3, %bl ; FALLBACK20-NEXT: xorps %xmm2, %xmm2 ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) @@ -6919,82 +6601,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl ; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %ebx -; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movb %dh, %dl -; FALLBACK20-NEXT: notb %dl 
-; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK20-NEXT: movl %esi, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax +; FALLBACK20-NEXT: movsbl %cl, %ebp +; FALLBACK20-NEXT: movl 100(%esp,%ebp), %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %ebx, %ecx +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %ch +; FALLBACK20-NEXT: notb %ch +; FALLBACK20-NEXT: movl 96(%esp,%ebp), %eax +; FALLBACK20-NEXT: movl %eax, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movl 92(%esp,%ebp), %esi +; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movb %bl, %cl ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl %ebx, %edi -; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK20-NEXT: movl %ebp, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %esi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK20-NEXT: movl %ebx, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 88(%esp,%ebp), %eax +; FALLBACK20-NEXT: movl %eax, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: movl 84(%esp,%ebp), %edx +; FALLBACK20-NEXT: movl %edx, %esi ; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, %eax +; FALLBACK20-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 80(%esp,%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: shrl %ebx -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %ebx -; 
FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 104(%esp,%ebp), %edi +; FALLBACK20-NEXT: movl %edi, %edx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK20-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %edx, (%eax) -; FALLBACK20-NEXT: movl %ebp, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) +; FALLBACK20-NEXT: movl %ebp, (%eax) +; FALLBACK20-NEXT: movl %edi, 28(%eax) +; FALLBACK20-NEXT: movl %edx, 24(%eax) ; FALLBACK20-NEXT: movl %ebx, 4(%eax) ; FALLBACK20-NEXT: movl %esi, 8(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -7003,7 +6692,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: addl $108, %esp +; FALLBACK20-NEXT: addl $124, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx @@ -7031,42 +6720,42 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al ; FALLBACK21-NEXT: negb %al -; FALLBACK21-NEXT: movsbl %al, %ebp -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movsbl %al, %ebx +; FALLBACK21-NEXT: movl 64(%esp,%ebx), %eax +; 
FALLBACK21-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 60(%esp,%ebx), %edx ; FALLBACK21-NEXT: shldl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK21-NEXT: shldl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK21-NEXT: shldl %cl, %ebx, %edi -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, %eax -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK21-NEXT: shldl %cl, %ebp, %edi +; FALLBACK21-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl %esi, %edx +; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK21-NEXT: shldl %cl, %eax, %edx +; FALLBACK21-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK21-NEXT: shldl %cl, %esi, %eax -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK21-NEXT: shldl %cl, %edx, %ebp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movl %ebp, 28(%edx) -; FALLBACK21-NEXT: movl %eax, 24(%edx) -; FALLBACK21-NEXT: movl %esi, %eax -; FALLBACK21-NEXT: shll %cl, %eax -; FALLBACK21-NEXT: shldl %cl, %esi, %ebx -; FALLBACK21-NEXT: movl %ebx, 4(%edx) -; FALLBACK21-NEXT: movl %edi, 8(%edx) -; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 12(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 16(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 20(%edx) -; FALLBACK21-NEXT: movl %eax, (%edx) +; FALLBACK21-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl %esi, %ebx +; FALLBACK21-NEXT: shll %cl, %ebx +; FALLBACK21-NEXT: shldl %cl, %esi, %ebp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl %eax, 28(%ecx) +; FALLBACK21-NEXT: movl %edx, 24(%ecx) +; FALLBACK21-NEXT: movl %ebp, 4(%ecx) +; FALLBACK21-NEXT: movl %edi, 8(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 12(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ecx) +; FALLBACK21-NEXT: movl %ebx, (%ecx) ; FALLBACK21-NEXT: addl $92, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -7185,42 +6874,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al ; FALLBACK23-NEXT: negb %al -; FALLBACK23-NEXT: movsbl %al, %ebx -; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movsbl %al, %edi +; FALLBACK23-NEXT: movl 64(%esp,%edi), %eax +; FALLBACK23-NEXT: movl 68(%esp,%edi), %edx +; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl 60(%esp,%edi), %edx ; FALLBACK23-NEXT: shldl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK23-NEXT: shldl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK23-NEXT: shldl %cl, %ebp, %edi -; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, %eax -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: movl 56(%esp,%edi), %ebx +; FALLBACK23-NEXT: shldl %cl, %ebx, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK23-NEXT: shldl %cl, %ebp, %ebx +; FALLBACK23-NEXT: movl 72(%esp,%edi), %esi +; FALLBACK23-NEXT: movl %esi, %edx +; FALLBACK23-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK23-NEXT: shldl %cl, %eax, %edx +; FALLBACK23-NEXT: movl 76(%esp,%edi), %eax ; FALLBACK23-NEXT: shldl %cl, %esi, %eax -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK23-NEXT: shldl %cl, %edx, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movl %ebx, 28(%edx) -; FALLBACK23-NEXT: movl %eax, 24(%edx) -; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK23-NEXT: shldl %cl, %esi, %ebp -; FALLBACK23-NEXT: movl %ebp, 4(%edx) -; FALLBACK23-NEXT: movl %edi, 8(%edx) -; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 12(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 16(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 20(%edx) -; FALLBACK23-NEXT: movl %eax, (%edx) +; FALLBACK23-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK23-NEXT: movl %eax, 28(%esi) +; FALLBACK23-NEXT: movl %edx, 24(%esi) +; FALLBACK23-NEXT: movl %ebp, 4(%esi) +; FALLBACK23-NEXT: movl %ebx, 8(%esi) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 12(%esi) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 16(%esi) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 20(%esi) +; FALLBACK23-NEXT: movl %ecx, (%esi) ; FALLBACK23-NEXT: addl $92, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi @@ -7234,94 +6922,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp +; FALLBACK24-NEXT: subl $124, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movb %cl, %dh -; FALLBACK24-NEXT: shlb $3, %dh +; FALLBACK24-NEXT: movl %ecx, %ebx +; FALLBACK24-NEXT: shlb $3, %bl ; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl ; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %ebx -; 
FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movb %dh, %dl -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK24-NEXT: movl %esi, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl %ebx, %edi -; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %esi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK24-NEXT: movl %ebx, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: movsbl %cl, %ebp +; FALLBACK24-NEXT: movl 100(%esp,%ebp), %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %ebx, %ecx +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %ch +; FALLBACK24-NEXT: notb %ch +; FALLBACK24-NEXT: movl 96(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, %esi ; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 92(%esp,%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 88(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, %esi +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 84(%esp,%ebp), %edx +; FALLBACK24-NEXT: movl %edx, %esi +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %ebx, %eax +; FALLBACK24-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 80(%esp,%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: shrl %ebx -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 104(%esp,%ebp), %edi +; FALLBACK24-NEXT: movl %edi, %edx +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK24-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %edx, (%eax) -; FALLBACK24-NEXT: movl %ebp, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) +; FALLBACK24-NEXT: movl %ebp, (%eax) +; FALLBACK24-NEXT: movl %edi, 28(%eax) +; FALLBACK24-NEXT: movl %edx, 24(%eax) ; FALLBACK24-NEXT: movl %ebx, 4(%eax) ; FALLBACK24-NEXT: movl %esi, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -7330,7 +7025,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %ecx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: addl $108, %esp +; FALLBACK24-NEXT: addl $124, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx @@ -7356,42 +7051,42 @@ define void @shl_32bytes(ptr 
%src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al ; FALLBACK25-NEXT: negb %al -; FALLBACK25-NEXT: movsbl %al, %ebp -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movsbl %al, %ebx +; FALLBACK25-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 60(%esp,%ebx), %edx ; FALLBACK25-NEXT: shldl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK25-NEXT: shldl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK25-NEXT: shldl %cl, %ebx, %edi -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, %eax -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK25-NEXT: shldl %cl, %ebp, %edi +; FALLBACK25-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl %esi, %edx +; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK25-NEXT: shldl %cl, %eax, %edx +; FALLBACK25-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK25-NEXT: shldl %cl, %esi, %eax -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK25-NEXT: shldl %cl, %edx, %ebp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: movl %ebp, 28(%edx) -; FALLBACK25-NEXT: movl %eax, 24(%edx) -; FALLBACK25-NEXT: movl %esi, %eax -; FALLBACK25-NEXT: shll %cl, %eax -; FALLBACK25-NEXT: shldl %cl, %esi, %ebx -; FALLBACK25-NEXT: movl %ebx, 4(%edx) -; FALLBACK25-NEXT: movl %edi, 8(%edx) -; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 12(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 16(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 20(%edx) -; FALLBACK25-NEXT: movl %eax, (%edx) +; FALLBACK25-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl %esi, %ebx +; FALLBACK25-NEXT: shll %cl, %ebx +; FALLBACK25-NEXT: shldl %cl, %esi, %ebp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %eax, 28(%ecx) +; FALLBACK25-NEXT: movl %edx, 24(%ecx) +; FALLBACK25-NEXT: movl %ebp, 4(%ecx) +; FALLBACK25-NEXT: movl %edi, 8(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 16(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ecx) +; FALLBACK25-NEXT: movl %ebx, (%ecx) ; FALLBACK25-NEXT: addl $92, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -7506,42 +7201,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al ; FALLBACK27-NEXT: negb %al -; 
FALLBACK27-NEXT: movsbl %al, %ebx -; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movsbl %al, %edi +; FALLBACK27-NEXT: movl 64(%esp,%edi), %eax +; FALLBACK27-NEXT: movl 68(%esp,%edi), %edx +; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl 60(%esp,%edi), %edx ; FALLBACK27-NEXT: shldl %cl, %edx, %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK27-NEXT: shldl %cl, %edi, %edx -; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK27-NEXT: shldl %cl, %ebp, %edi -; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, %eax -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: movl 56(%esp,%edi), %ebx +; FALLBACK27-NEXT: shldl %cl, %ebx, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK27-NEXT: shldl %cl, %ebp, %ebx +; FALLBACK27-NEXT: movl 72(%esp,%edi), %esi +; FALLBACK27-NEXT: movl %esi, %edx +; FALLBACK27-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK27-NEXT: shldl %cl, %eax, %edx +; FALLBACK27-NEXT: movl 76(%esp,%edi), %eax ; FALLBACK27-NEXT: shldl %cl, %esi, %eax -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: movl %ebx, 28(%edx) -; FALLBACK27-NEXT: movl %eax, 24(%edx) -; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl %ebp, 4(%edx) -; FALLBACK27-NEXT: movl %edi, 8(%edx) -; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%edx) -; FALLBACK27-NEXT: movl %eax, (%edx) +; FALLBACK27-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK27-NEXT: movl %eax, 28(%esi) +; FALLBACK27-NEXT: movl %edx, 24(%esi) +; FALLBACK27-NEXT: movl %ebp, 4(%esi) +; FALLBACK27-NEXT: movl %ebx, 8(%esi) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 12(%esi) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 16(%esi) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 20(%esi) +; FALLBACK27-NEXT: movl %ecx, (%esi) ; FALLBACK27-NEXT: addl $92, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi @@ -7556,94 +7250,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp +; FALLBACK28-NEXT: subl $124, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: 
vmovups (%ecx), %ymm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: shlb $3, %dh +; FALLBACK28-NEXT: movl %ecx, %ebx +; FALLBACK28-NEXT: shlb $3, %bl ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl ; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %ebx -; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movb %dh, %dl -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax +; FALLBACK28-NEXT: movsbl %cl, %ebp +; FALLBACK28-NEXT: movl 100(%esp,%ebp), %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %ebx, %ecx +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %ch +; FALLBACK28-NEXT: notb %ch +; FALLBACK28-NEXT: movl 96(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: movl 92(%esp,%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movb %bl, %cl ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %esi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 88(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: movl 84(%esp,%ebp), %edx +; FALLBACK28-NEXT: movl %edx, %esi ; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; 
FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, %eax +; FALLBACK28-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 80(%esp,%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: shrl %ebx -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 104(%esp,%ebp), %edi +; FALLBACK28-NEXT: movl %edi, %edx +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK28-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %ebp, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) +; FALLBACK28-NEXT: movl %ebp, (%eax) +; FALLBACK28-NEXT: movl %edi, 28(%eax) +; FALLBACK28-NEXT: movl %edx, 24(%eax) ; FALLBACK28-NEXT: movl %ebx, 4(%eax) ; FALLBACK28-NEXT: movl %esi, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -7652,7 +7353,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %ecx, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; 
FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: addl $108, %esp +; FALLBACK28-NEXT: addl $124, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx @@ -7678,42 +7379,42 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al ; FALLBACK29-NEXT: negb %al -; FALLBACK29-NEXT: movsbl %al, %ebp -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movsbl %al, %ebx +; FALLBACK29-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 60(%esp,%ebx), %edx ; FALLBACK29-NEXT: shldl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK29-NEXT: shldl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK29-NEXT: shldl %cl, %ebx, %edi -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, %eax -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK29-NEXT: shldl %cl, %ebp, %edi +; FALLBACK29-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl %esi, %edx +; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK29-NEXT: shldl %cl, %eax, %edx +; FALLBACK29-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK29-NEXT: shldl %cl, %esi, %eax -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK29-NEXT: shldl %cl, %edx, %ebp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: movl %ebp, 28(%edx) -; FALLBACK29-NEXT: movl %eax, 24(%edx) -; FALLBACK29-NEXT: movl %esi, %eax -; FALLBACK29-NEXT: shll %cl, %eax -; FALLBACK29-NEXT: shldl %cl, %esi, %ebx -; FALLBACK29-NEXT: movl %ebx, 4(%edx) -; FALLBACK29-NEXT: movl %edi, 8(%edx) -; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 12(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 16(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 20(%edx) -; FALLBACK29-NEXT: movl %eax, (%edx) +; FALLBACK29-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl %esi, %ebx +; FALLBACK29-NEXT: shll %cl, %ebx +; FALLBACK29-NEXT: shldl %cl, %esi, %ebp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl %eax, 28(%ecx) +; FALLBACK29-NEXT: movl %edx, 24(%ecx) +; FALLBACK29-NEXT: movl %ebp, 4(%ecx) +; FALLBACK29-NEXT: movl %edi, 8(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ecx) +; FALLBACK29-NEXT: movl %ebx, (%ecx) ; FALLBACK29-NEXT: addl $92, %esp ; FALLBACK29-NEXT: popl %esi ; 
FALLBACK29-NEXT: popl %edi @@ -7828,42 +7529,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al ; FALLBACK31-NEXT: negb %al -; FALLBACK31-NEXT: movsbl %al, %ebx -; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movsbl %al, %edi +; FALLBACK31-NEXT: movl 64(%esp,%edi), %eax +; FALLBACK31-NEXT: movl 68(%esp,%edi), %edx +; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %eax, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl 60(%esp,%edi), %edx ; FALLBACK31-NEXT: shldl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK31-NEXT: shldl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK31-NEXT: shldl %cl, %ebp, %edi -; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, %eax -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: movl 56(%esp,%edi), %ebx +; FALLBACK31-NEXT: shldl %cl, %ebx, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK31-NEXT: shldl %cl, %ebp, %ebx +; FALLBACK31-NEXT: movl 72(%esp,%edi), %esi +; FALLBACK31-NEXT: movl %esi, %edx +; FALLBACK31-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK31-NEXT: shldl %cl, %eax, %edx +; FALLBACK31-NEXT: movl 76(%esp,%edi), %eax ; FALLBACK31-NEXT: shldl %cl, %esi, %eax -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: movl %ebx, 28(%edx) -; FALLBACK31-NEXT: movl %eax, 24(%edx) -; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl %ebp, 4(%edx) -; FALLBACK31-NEXT: movl %edi, 8(%edx) -; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 20(%edx) -; FALLBACK31-NEXT: movl %eax, (%edx) +; FALLBACK31-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK31-NEXT: movl %eax, 28(%esi) +; FALLBACK31-NEXT: movl %edx, 24(%esi) +; FALLBACK31-NEXT: movl %ebp, 4(%esi) +; FALLBACK31-NEXT: movl %ebx, 8(%esi) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 12(%esi) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 16(%esi) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 20(%esi) +; FALLBACK31-NEXT: movl %ecx, (%esi) ; FALLBACK31-NEXT: addl $92, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi @@ -7882,171 +7582,155 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void 
@shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_32bytes_dwordOff: ; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movzbl (%rsi), %ecx +; FALLBACK0-NEXT: movl %ecx, %eax ; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: xorps %xmm2, %xmm2 +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: shlb $2, %sil -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: negb %sil -; FALLBACK0-NEXT: movsbq %sil, %r10 -; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 -; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: shlb $2, %cl +; FALLBACK0-NEXT: andb $24, %cl +; FALLBACK0-NEXT: negb %cl +; FALLBACK0-NEXT: movsbq %cl, %r8 +; FALLBACK0-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: shlq %cl, %r9 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 -; FALLBACK0-NEXT: movq %r10, %rbx -; FALLBACK0-NEXT: shrq %rbx +; FALLBACK0-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK0-NEXT: movq %r10, %rdi +; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: orq %r11, %rbx +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: orq %r9, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: shrq %rdi +; FALLBACK0-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK0-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: shrq %r11 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %rbx, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: orq %r10, %r11 +; FALLBACK0-NEXT: movq %r9, %r10 +; FALLBACK0-NEXT: shrq %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: orq %r8, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, (%rdx) +; FALLBACK0-NEXT: movq %r10, 8(%rdx) +; FALLBACK0-NEXT: movq %r11, 16(%rdx) +; FALLBACK0-NEXT: movq %rdi, 24(%rdx) ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_32bytes_dwordOff: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 
8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %rdi -; FALLBACK1-NEXT: movzbl (%rsi), %esi -; FALLBACK1-NEXT: movl %esi, %ecx +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movzbl (%rsi), %eax +; FALLBACK1-NEXT: movl %eax, %ecx ; FALLBACK1-NEXT: shlb $5, %cl -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: xorps %xmm2, %xmm2 +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: shlb $2, %sil -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: negb %sil -; FALLBACK1-NEXT: movsbq %sil, %rax +; FALLBACK1-NEXT: shlb $2, %al +; FALLBACK1-NEXT: andb $24, %al +; FALLBACK1-NEXT: negb %al +; FALLBACK1-NEXT: movsbq %al, %rax ; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: shldq %cl, %r8, %rax -; FALLBACK1-NEXT: shlq %cl, %r8 +; FALLBACK1-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK1-NEXT: shldq %cl, %r8, %rsi +; FALLBACK1-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK1-NEXT: movq %rax, %r9 +; FALLBACK1-NEXT: shlq %cl, %r9 +; FALLBACK1-NEXT: shldq %cl, %rax, %r8 +; FALLBACK1-NEXT: movq %r8, 8(%rdx) ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rdi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rax, 8(%rdx) +; FALLBACK1-NEXT: movq %r9, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: shl_32bytes_dwordOff: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movzbl (%rsi), %ecx +; FALLBACK2-NEXT: movl %ecx, %eax ; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: xorps %xmm2, %xmm2 +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: shlb $2, %sil -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 +; FALLBACK2-NEXT: shlb $2, %cl +; FALLBACK2-NEXT: andb $24, %cl +; FALLBACK2-NEXT: negb %cl +; FALLBACK2-NEXT: movsbq %cl, %rcx +; FALLBACK2-NEXT: shlxq %rax, 
-16(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK2-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK2-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK2-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r8, %rcx +; FALLBACK2-NEXT: shrq %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, (%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movzbl (%rsi), %eax +; FALLBACK3-NEXT: movl %eax, %ecx ; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: xorps %xmm2, %xmm2 +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: shlb $2, %sil -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: negb %sil -; FALLBACK3-NEXT: movsbq %sil, %rax +; FALLBACK3-NEXT: shlb $2, %al +; FALLBACK3-NEXT: andb $24, %al +; FALLBACK3-NEXT: negb %al +; FALLBACK3-NEXT: movsbq %al, %rax ; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: shldq %cl, %r8, %rax -; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx +; FALLBACK3-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK3-NEXT: shldq %cl, %r8, %rsi +; FALLBACK3-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK3-NEXT: shlxq %rcx, %rax, %r9 +; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK3-NEXT: shldq %cl, %rax, %r8 +; FALLBACK3-NEXT: movq %r8, 8(%rdx) ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rdi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rax, 8(%rdx) +; FALLBACK3-NEXT: movq %r9, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: shl_32bytes_dwordOff: @@ -8084,9 +7768,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK4-NEXT: shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %r10, %r11 ; 
FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: orq %r10, %r11 ; FALLBACK4-NEXT: movq %r9, %r10 ; FALLBACK4-NEXT: shrq %r10 ; FALLBACK4-NEXT: movl %esi, %ecx @@ -8119,13 +7803,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq %r8, %r9 +; FALLBACK5-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK5-NEXT: shldq %cl, %r8, %rsi +; FALLBACK5-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r9 ; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: shldq %cl, %r8, %rax -; FALLBACK5-NEXT: movq %rax, 8(%rdx) +; FALLBACK5-NEXT: shldq %cl, %rax, %r8 +; FALLBACK5-NEXT: movq %r8, 8(%rdx) ; FALLBACK5-NEXT: movq %rsi, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) @@ -8190,13 +7874,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK7-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK7-NEXT: shldq %cl, %r8, %rsi +; FALLBACK7-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK7-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r8, %rax -; FALLBACK7-NEXT: movq %rax, 8(%rdx) +; FALLBACK7-NEXT: shldq %cl, %rax, %r8 +; FALLBACK7-NEXT: movq %r8, 8(%rdx) ; FALLBACK7-NEXT: movq %rsi, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) @@ -8234,9 +7918,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movq %r9, %r10 ; FALLBACK8-NEXT: shrq %r10 ; FALLBACK8-NEXT: movl %esi, %ecx @@ -8267,13 +7951,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq %r8, %r9 +; FALLBACK9-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK9-NEXT: shldq %cl, %r8, %rsi +; FALLBACK9-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r9 ; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: shldq %cl, %r8, %rax -; FALLBACK9-NEXT: movq %rax, 8(%rdx) +; FALLBACK9-NEXT: shldq %cl, %rax, %r8 +; FALLBACK9-NEXT: movq %r8, 8(%rdx) ; FALLBACK9-NEXT: movq %rsi, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) @@ -8334,13 +8018,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%rax), 
%rax -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK11-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK11-NEXT: shldq %cl, %r8, %rsi +; FALLBACK11-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK11-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r8, %rax -; FALLBACK11-NEXT: movq %rax, 8(%rdx) +; FALLBACK11-NEXT: shldq %cl, %rax, %r8 +; FALLBACK11-NEXT: movq %r8, 8(%rdx) ; FALLBACK11-NEXT: movq %rsi, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) @@ -8379,9 +8063,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: orq %r10, %r11 ; FALLBACK12-NEXT: movq %r9, %r10 ; FALLBACK12-NEXT: shrq %r10 ; FALLBACK12-NEXT: movl %esi, %ecx @@ -8412,13 +8096,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq %r8, %r9 +; FALLBACK13-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK13-NEXT: shldq %cl, %r8, %rsi +; FALLBACK13-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK13-NEXT: movq %rax, %r9 ; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: shldq %cl, %r8, %rax -; FALLBACK13-NEXT: movq %rax, 8(%rdx) +; FALLBACK13-NEXT: shldq %cl, %rax, %r8 +; FALLBACK13-NEXT: movq %r8, 8(%rdx) ; FALLBACK13-NEXT: movq %rsi, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) @@ -8479,13 +8163,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK15-NEXT: movq -32(%rsp,%rax), %r8 +; FALLBACK15-NEXT: shldq %cl, %r8, %rsi +; FALLBACK15-NEXT: movq -40(%rsp,%rax), %rax +; FALLBACK15-NEXT: shlxq %rcx, %rax, %r9 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r8, %rax -; FALLBACK15-NEXT: movq %rax, 8(%rdx) +; FALLBACK15-NEXT: shldq %cl, %rax, %r8 +; FALLBACK15-NEXT: movq %r8, 8(%rdx) ; FALLBACK15-NEXT: movq %rsi, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) @@ -8494,67 +8178,27 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; ; X86-SSE2-LABEL: shl_32bytes_dwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: subl $76, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SSE2-NEXT: movl (%ebp), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%ebp), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ebp), 
%esi -; X86-SSE2-NEXT: movl 12(%ebp), %edi -; X86-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 ; X86-SSE2-NEXT: movzbl (%ecx), %ecx -; X86-SSE2-NEXT: movl 20(%ebp), %edx -; X86-SSE2-NEXT: movl 24(%ebp), %eax -; X86-SSE2-NEXT: movl 28(%ebp), %ebp -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: xorps %xmm2, %xmm2 +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, (%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shlb $2, %cl ; X86-SSE2-NEXT: andb $28, %cl ; X86-SSE2-NEXT: negb %cl -; X86-SSE2-NEXT: movsbl %cl, %edx -; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi -; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi -; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx -; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp -; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx -; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 24(%eax) -; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movsbl %cl, %ecx +; X86-SSE2-NEXT: movups 32(%esp,%ecx), %xmm0 +; X86-SSE2-NEXT: movups 48(%esp,%ecx), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $76, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_32bytes_dwordOff: @@ -8615,30 +8259,22 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: shl_32bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %rdi -; X64-SSE2-NEXT: movzbl (%rsi), %esi -; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movzbl (%rsi), %eax +; X64-SSE2-NEXT: xorps %xmm2, %xmm2 +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; 
X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: shlb $3, %sil -; X64-SSE2-NEXT: andb $24, %sil -; X64-SSE2-NEXT: negb %sil -; X64-SSE2-NEXT: movsbq %sil, %rax -; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx -; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) -; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: shlb $3, %al +; X64-SSE2-NEXT: andb $24, %al +; X64-SSE2-NEXT: negb %al +; X64-SSE2-NEXT: movsbq %al, %rax +; X64-SSE2-NEXT: movups -40(%rsp,%rax), %xmm0 +; X64-SSE2-NEXT: movups -24(%rsp,%rax), %xmm1 +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: shl_32bytes_qwordOff: @@ -8681,67 +8317,27 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou ; ; X86-SSE2-LABEL: shl_32bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: subl $76, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SSE2-NEXT: movl (%ebp), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%ebp), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ebp), %esi -; X86-SSE2-NEXT: movl 12(%ebp), %edi -; X86-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 ; X86-SSE2-NEXT: movzbl (%ecx), %ecx -; X86-SSE2-NEXT: movl 20(%ebp), %edx -; X86-SSE2-NEXT: movl 24(%ebp), %eax -; X86-SSE2-NEXT: movl 28(%ebp), %ebp -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: xorps %xmm2, %xmm2 +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, (%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shlb $3, %cl ; X86-SSE2-NEXT: andb $24, %cl ; X86-SSE2-NEXT: negb %cl -; X86-SSE2-NEXT: movsbl %cl, %edx -; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi -; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi -; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx -; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp -; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx -; X86-SSE2-NEXT: movl 72(%esp,%edx), 
%edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 24(%eax) -; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movsbl %cl, %ecx +; X86-SSE2-NEXT: movups 32(%esp,%ecx), %xmm0 +; X86-SSE2-NEXT: movups 48(%esp,%ecx), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $76, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_32bytes_qwordOff: @@ -8802,17 +8398,16 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_32bytes: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movq 16(%rdi), %rcx ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: leal (,%rsi,8), %eax ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: sarq $63, %rdi ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -8820,52 +8415,51 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $24, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK0-NEXT: movq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: addq %r11, %r11 
+; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 8(%rdx) +; FALLBACK0-NEXT: movq %r14, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_32bytes: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movq 16(%rdi), %rax ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -8873,34 +8467,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $24, %sil ; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq %rdi, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %r10 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %r9, %rax ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: sarq %cl, %rsi +; FALLBACK1-NEXT: movq %r10, 8(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: ashr_32bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movq 16(%rdi), %rcx ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -8908,42 +8501,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq 
-64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r9, %rcx +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rax, %r8, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movq 16(%rdi), %rax ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -8951,23 +8542,25 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $24, %sil ; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq %rdi, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %r10 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %r9, %rax +; FALLBACK3-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK3-NEXT: movq %r10, 8(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rcx, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: ashr_32bytes: ; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %r14 ; 
FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movq 16(%rdi), %rcx @@ -8984,38 +8577,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $24, %sil ; FALLBACK4-NEXT: movzbl %sil, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK4-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK4-NEXT: movq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: orq %rbx, %r14 +; FALLBACK4-NEXT: addq %r11, %r11 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %r8, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 8(%rdx) +; FALLBACK4-NEXT: movq %r14, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: ashr_32bytes: @@ -9039,17 +8633,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq %r9, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK5-NEXT: shrdq %cl, %r9, %rax ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: sarq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: ashr_32bytes: @@ -9114,20 +8708,21 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq %r9, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: sarxq 
%rcx, %rsi, %rax +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK7-NEXT: shrdq %cl, %r9, %rax +; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: movq %rcx, 24(%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: ashr_32bytes: ; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK8-NEXT: movq 16(%rdi), %rcx @@ -9144,38 +8739,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $24, %sil ; FALLBACK8-NEXT: movzbl %sil, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK8-NEXT: movq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: orq %rbx, %r14 +; FALLBACK8-NEXT: addq %r11, %r11 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %r8, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 8(%rdx) +; FALLBACK8-NEXT: movq %r14, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: ashr_32bytes: @@ -9199,17 +8795,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq %r9, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK9-NEXT: shrdq %cl, %r9, %rax ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: sarq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: ashr_32bytes: @@ -9274,20 +8870,21 @@ define void 
@ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq %r9, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK11-NEXT: shrdq %cl, %r9, %rax +; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: movq %rcx, 24(%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: ashr_32bytes: ; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK12-NEXT: movq 16(%rdi), %rcx @@ -9304,38 +8901,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $24, %sil ; FALLBACK12-NEXT: movzbl %sil, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r11 +; FALLBACK12-NEXT: movq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: orq %rbx, %r14 +; FALLBACK12-NEXT: addq %r11, %r11 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %r8, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 8(%rdx) +; FALLBACK12-NEXT: movq %r14, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: ashr_32bytes: @@ -9359,17 +8957,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: movq -64(%rsp,%rax), %r9 
+; FALLBACK13-NEXT: movq %r9, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK13-NEXT: shrdq %cl, %r9, %rax ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: sarq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: ashr_32bytes: @@ -9434,16 +9032,16 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: movq -64(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq %r9, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK15-NEXT: shrdq %cl, %r9, %rax +; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: movq %rcx, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: ashr_32bytes: @@ -9454,122 +9052,115 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $108, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK16-NEXT: movl (%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%esi), %ebx -; FALLBACK16-NEXT: movl 12(%esi), %ebp -; FALLBACK16-NEXT: movl 16(%esi), %edi +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movl 16(%ecx), %esi +; FALLBACK16-NEXT: movl 20(%ecx), %edi +; FALLBACK16-NEXT: movl 24(%ecx), %ebx +; FALLBACK16-NEXT: movl 28(%ecx), %edx ; FALLBACK16-NEXT: movzbl (%eax), %ecx -; FALLBACK16-NEXT: movl 20(%esi), %edx -; FALLBACK16-NEXT: movl 24(%esi), %eax -; FALLBACK16-NEXT: movl 28(%esi), %esi -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movb %cl, %ch +; FALLBACK16-NEXT: shlb $3, %ch ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, %edx -; FALLBACK16-NEXT: shlb $3, %dl -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: sarl $31, %esi -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, 
{{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: sarl $31, %edx +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: andb $28, %cl -; FALLBACK16-NEXT: movzbl %cl, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movb %dl, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebp +; FALLBACK16-NEXT: movzbl %cl, %ebx +; FALLBACK16-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%esp,%ebx), %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: notb %cl +; FALLBACK16-NEXT: leal (%edx,%edx), %esi +; FALLBACK16-NEXT: movl %ecx, %edx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 44(%esp,%ebx), %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 48(%esp,%ebx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp -; FALLBACK16-NEXT: movl %ebp, %esi +; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, %ebx -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%edx,%edx), %eax -; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %esi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebx, %edx -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %eax -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi +; FALLBACK16-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl 
%esi, %edi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK16-NEXT: movl %edx, %ebp +; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl %edi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK16-NEXT: leal (,%ebp,2), %edi +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %eax, %edi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax -; FALLBACK16-NEXT: leal (%eax,%eax), %edx +; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; FALLBACK16-NEXT: sarl %cl, %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl %eax, 28(%ecx) -; FALLBACK16-NEXT: movl %edx, 24(%ecx) -; FALLBACK16-NEXT: movl %edi, 16(%ecx) -; FALLBACK16-NEXT: movl %esi, 20(%ecx) -; FALLBACK16-NEXT: movl %ebp, 8(%ecx) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, 12(%ecx) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, (%ecx) +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, 4(%ecx) +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: orl %ebp, %ebx +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK16-NEXT: sarl %cl, %edi +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %edi, 28(%eax) +; FALLBACK16-NEXT: movl %esi, 4(%eax) 
+; FALLBACK16-NEXT: movl %ebx, 24(%eax) +; FALLBACK16-NEXT: movl %edx, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, (%eax) ; FALLBACK16-NEXT: addl $108, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi @@ -9583,81 +9174,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp +; FALLBACK17-NEXT: subl $108, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %edx -; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %ebp -; FALLBACK17-NEXT: movl 16(%ecx), %ebx -; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movl 16(%ecx), %esi ; FALLBACK17-NEXT: movl 20(%ecx), %edi -; FALLBACK17-NEXT: movl 24(%ecx), %edx -; FALLBACK17-NEXT: movl 28(%ecx), %esi -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl 24(%ecx), %ebx +; FALLBACK17-NEXT: movl 28(%ecx), %edx +; FALLBACK17-NEXT: movzbl (%eax), %eax ; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: sarl $31, %edx +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: sarl $31, %esi -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: andb $28, %al -; FALLBACK17-NEXT: movzbl %al, %ebp -; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movzbl %al, %edi +; FALLBACK17-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK17-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK17-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK17-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK17-NEXT: movl %esi, %edx +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK17-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: sarl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl %ebx, 16(%ebp) -; FALLBACK17-NEXT: movl %edi, 20(%ebp) +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %edx, 4(%ecx) +; FALLBACK17-NEXT: movl %ebp, 24(%ecx) +; FALLBACK17-NEXT: movl %eax, 28(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %esi, (%ebp) +; FALLBACK17-NEXT: movl %eax, 16(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $92, %esp +; FALLBACK17-NEXT: movl %eax, 20(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ecx) +; FALLBACK17-NEXT: movl %edi, (%ecx) +; FALLBACK17-NEXT: addl $108, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx @@ -9671,100 +9253,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi 
-; FALLBACK18-NEXT: movl (%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%esi), %ebx -; FALLBACK18-NEXT: movl 12(%esi), %ebp -; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx -; FALLBACK18-NEXT: movl 24(%esi), %eax -; FALLBACK18-NEXT: movl 28(%esi), %esi -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movl 16(%ecx), %esi +; FALLBACK18-NEXT: movl 20(%ecx), %edi +; FALLBACK18-NEXT: movl 24(%ecx), %ebx +; FALLBACK18-NEXT: movl 28(%ecx), %edx +; FALLBACK18-NEXT: movzbl (%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: sarl $31, %edx +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: sarl $31, %esi -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: andb $28, %cl ; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ecx ; FALLBACK18-NEXT: movl %eax, %edx ; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx +; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi +; FALLBACK18-NEXT: orl %ecx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK18-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK18-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %ebx, %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx +; FALLBACK18-NEXT: orl %ebp, %ebx +; FALLBACK18-NEXT: shrxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx +; FALLBACK18-NEXT: sarxl %eax, %edi, %eax ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl %ecx, %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %esi, %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl %ecx, 4(%edx) +; FALLBACK18-NEXT: movl %edi, 24(%edx) +; FALLBACK18-NEXT: movl %ebx, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -9778,82 +9353,71 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp +; FALLBACK19-NEXT: subl $108, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %edx -; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ecx), %ebp -; FALLBACK19-NEXT: movl 16(%ecx), %ebx -; FALLBACK19-NEXT: movzbl (%eax), %eax +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movl 16(%ecx), %esi ; FALLBACK19-NEXT: movl 20(%ecx), %edi -; FALLBACK19-NEXT: movl 24(%ecx), %edx -; FALLBACK19-NEXT: movl 28(%ecx), %esi -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl 24(%ecx), %ebx +; FALLBACK19-NEXT: movl 28(%ecx), %edx +; FALLBACK19-NEXT: movzbl (%eax), %eax ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: sarl $31, %edx +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: sarl $31, %esi -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $28, %al -; FALLBACK19-NEXT: movzbl %al, %ebp -; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %esi, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, (%esp) 
# 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movzbl %al, %edi +; FALLBACK19-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK19-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edx, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl %edx, %esi ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi -; FALLBACK19-NEXT: shrdl %cl, %edi, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl %ebx, 16(%ebp) -; FALLBACK19-NEXT: movl %esi, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK19-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, %ebx +; FALLBACK19-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) +; FALLBACK19-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK19-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $92, %esp +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl %esi, 4(%eax) +; FALLBACK19-NEXT: movl %ebp, 24(%eax) +; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK19-NEXT: movl %esi, 28(%eax) +; FALLBACK19-NEXT: movl %edx, 16(%eax) +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %ebx, 20(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 8(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 12(%eax) +; FALLBACK19-NEXT: movl %edi, (%eax) +; FALLBACK19-NEXT: addl $108, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx @@ -9874,9 +9438,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl 20(%ecx), %edi ; FALLBACK20-NEXT: movl 24(%ecx), %ebx ; FALLBACK20-NEXT: movl 28(%ecx), %edx -; FALLBACK20-NEXT: movzbl (%eax), %eax -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shlb $3, %cl +; FALLBACK20-NEXT: movzbl (%eax), %ecx +; FALLBACK20-NEXT: movb %cl, %ch +; FALLBACK20-NEXT: shlb $3, %ch ; FALLBACK20-NEXT: movl %edx, 
{{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -9891,88 +9455,92 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %al -; FALLBACK20-NEXT: movzbl %al, %edi -; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: andb $28, %cl +; FALLBACK20-NEXT: movzbl %cl, %ebx +; FALLBACK20-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%esp,%ebx), %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: notb %cl +; FALLBACK20-NEXT: leal (%edx,%edx), %esi ; FALLBACK20-NEXT: movl %ecx, %edx -; FALLBACK20-NEXT: movb %cl, %dh -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %eax, %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %eax, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %eax -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movl 44(%esp,%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax +; FALLBACK20-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK20-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK20-NEXT: movl %edx, %ebp +; FALLBACK20-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl %edi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK20-NEXT: leal (,%ebp,2), %edi +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %ebx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %eax, %ebp -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: sarl %cl, %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movl %eax, 28(%ecx) -; FALLBACK20-NEXT: movl %esi, 4(%ecx) -; FALLBACK20-NEXT: movl %edi, 24(%ecx) -; FALLBACK20-NEXT: movl %ebp, 16(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 20(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 8(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, 12(%ecx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, (%ecx) +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: orl %eax, %esi +; FALLBACK20-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK20-NEXT: sarl %cl, %edi +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl 
%edi, 28(%eax) +; FALLBACK20-NEXT: movl %esi, 4(%eax) +; FALLBACK20-NEXT: movl %ebx, 24(%eax) +; FALLBACK20-NEXT: movl %edx, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: addl $108, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi @@ -10012,45 +9580,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: movzbl %al, %ebp -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK21-NEXT: movzbl %al, %edi +; FALLBACK21-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK21-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK21-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl %edi, %esi -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %esi, 4(%ebp) -; FALLBACK21-NEXT: movl %ebx, 24(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK21-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK21-NEXT: movl %esi, %edx +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK21-NEXT: shrdl %cl, %esi, %edi ; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl %edx, 4(%ecx) +; FALLBACK21-NEXT: movl %ebp, 24(%ecx) +; FALLBACK21-NEXT: movl %eax, 28(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl %eax, 16(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl %eax, 20(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: 
movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl %eax, 8(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %edx, (%ebp) +; FALLBACK21-NEXT: movl %eax, 12(%ecx) +; FALLBACK21-NEXT: movl %edi, (%ecx) ; FALLBACK21-NEXT: addl $108, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -10191,44 +9759,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: movzbl %al, %ebx -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edi -; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK23-NEXT: movzbl %al, %edi +; FALLBACK23-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK23-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK23-NEXT: movl %eax, %esi +; FALLBACK23-NEXT: shrdl %cl, %edx, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK23-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK23-NEXT: movl %eax, %ebx +; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %ebx, 4(%eax) +; FALLBACK23-NEXT: movl %esi, 4(%eax) ; FALLBACK23-NEXT: movl %ebp, 24(%eax) -; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK23-NEXT: movl %ebx, 28(%eax) -; FALLBACK23-NEXT: movl %esi, 16(%eax) -; FALLBACK23-NEXT: movl %edi, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 12(%eax) +; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK23-NEXT: movl %esi, 28(%eax) +; FALLBACK23-NEXT: movl %edx, 16(%eax) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, (%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %ebx, 20(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 8(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%eax) +; FALLBACK23-NEXT: movl %edi, (%eax) ; FALLBACK23-NEXT: addl $108, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi @@ -10250,9 +9818,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl 20(%ecx), %edi ; FALLBACK24-NEXT: movl 24(%ecx), %ebx ; FALLBACK24-NEXT: movl 28(%ecx), %edx -; FALLBACK24-NEXT: movzbl (%eax), %eax -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shlb $3, %cl +; FALLBACK24-NEXT: movzbl (%eax), %ecx +; FALLBACK24-NEXT: movb %cl, %ch +; FALLBACK24-NEXT: shlb $3, %ch ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -10267,88 +9835,92 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %al -; FALLBACK24-NEXT: movzbl %al, %edi -; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: andb $28, %cl +; FALLBACK24-NEXT: movzbl %cl, %ebx +; FALLBACK24-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 36(%esp,%ebx), %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: notb %cl +; FALLBACK24-NEXT: leal (%edx,%edx), %esi ; FALLBACK24-NEXT: movl %ecx, %edx -; FALLBACK24-NEXT: movb %cl, %dh -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %eax, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %eax, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movl 44(%esp,%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax +; FALLBACK24-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK24-NEXT: movl %edx, %ebp +; FALLBACK24-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl %edi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK24-NEXT: leal (,%ebp,2), %edi +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %ebx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %eax, %ebp -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi 
-; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: sarl %cl, %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: movl %eax, 28(%ecx) -; FALLBACK24-NEXT: movl %esi, 4(%ecx) -; FALLBACK24-NEXT: movl %edi, 24(%ecx) -; FALLBACK24-NEXT: movl %ebp, 16(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 20(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 8(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, 12(%ecx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, (%ecx) +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: orl %eax, %esi +; FALLBACK24-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK24-NEXT: sarl %cl, %edi +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %edi, 28(%eax) +; FALLBACK24-NEXT: movl %esi, 4(%eax) +; FALLBACK24-NEXT: movl %ebx, 24(%eax) +; FALLBACK24-NEXT: movl %edx, 16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: addl $108, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi @@ -10388,45 +9960,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: movzbl %al, %ebp -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK25-NEXT: movzbl %al, %edi +; FALLBACK25-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK25-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK25-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl %edi, %esi -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %esi, 4(%ebp) -; FALLBACK25-NEXT: movl 
%ebx, 24(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK25-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK25-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK25-NEXT: movl %esi, %edx +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK25-NEXT: shrdl %cl, %esi, %edi ; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %edx, 4(%ecx) +; FALLBACK25-NEXT: movl %ebp, 24(%ecx) +; FALLBACK25-NEXT: movl %eax, 28(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl %eax, 16(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl %eax, 20(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl %eax, 8(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %edx, (%ebp) +; FALLBACK25-NEXT: movl %eax, 12(%ecx) +; FALLBACK25-NEXT: movl %edi, (%ecx) ; FALLBACK25-NEXT: addl $108, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -10567,44 +10139,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: movzbl %al, %ebx -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edx -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %edi -; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK27-NEXT: movzbl %al, %edi +; FALLBACK27-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK27-NEXT: movl 44(%esp,%edi), %eax +; FALLBACK27-NEXT: movl %eax, %esi +; FALLBACK27-NEXT: shrdl %cl, %edx, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK27-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK27-NEXT: movl %eax, %ebx +; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %ebx, 4(%eax) +; FALLBACK27-NEXT: movl %esi, 4(%eax) ; FALLBACK27-NEXT: movl %ebp, 24(%eax) -; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK27-NEXT: movl %ebx, 28(%eax) -; FALLBACK27-NEXT: movl %esi, 16(%eax) -; FALLBACK27-NEXT: movl %edi, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 12(%eax) +; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK27-NEXT: movl %esi, 28(%eax) +; FALLBACK27-NEXT: movl %edx, 16(%eax) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, (%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %ebx, 20(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 8(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 12(%eax) +; FALLBACK27-NEXT: movl %edi, (%eax) ; FALLBACK27-NEXT: addl $108, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi @@ -10626,9 +10198,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl 20(%ecx), %edi ; FALLBACK28-NEXT: movl 24(%ecx), %ebx ; FALLBACK28-NEXT: movl 28(%ecx), %edx -; FALLBACK28-NEXT: movzbl (%eax), %eax -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shlb $3, %cl +; FALLBACK28-NEXT: movzbl (%eax), %ecx +; FALLBACK28-NEXT: movb %cl, %ch +; FALLBACK28-NEXT: shlb $3, %ch ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -10643,88 +10215,92 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %al -; FALLBACK28-NEXT: movzbl %al, %edi -; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: andb $28, %cl +; FALLBACK28-NEXT: movzbl %cl, %ebx +; FALLBACK28-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%esp,%ebx), %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: notb %cl +; FALLBACK28-NEXT: leal (%edx,%edx), %esi ; FALLBACK28-NEXT: movl %ecx, %edx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx -; 
FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movl 44(%esp,%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax +; FALLBACK28-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 40(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 52(%esp,%ebx), %edx +; FALLBACK28-NEXT: movl %edx, %ebp +; FALLBACK28-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl %edi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK28-NEXT: leal (,%ebp,2), %edi +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 60(%esp,%ebx), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %ebx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx -; 
FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %eax, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: sarl %cl, %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: movl %eax, 28(%ecx) -; FALLBACK28-NEXT: movl %esi, 4(%ecx) -; FALLBACK28-NEXT: movl %edi, 24(%ecx) -; FALLBACK28-NEXT: movl %ebp, 16(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 20(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 8(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 12(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%ecx) +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: orl %eax, %esi +; FALLBACK28-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK28-NEXT: sarl %cl, %edi +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %edi, 28(%eax) +; FALLBACK28-NEXT: movl %esi, 4(%eax) +; FALLBACK28-NEXT: movl %ebx, 24(%eax) +; FALLBACK28-NEXT: movl %edx, 16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: addl $108, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi @@ -10764,45 +10340,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: movzbl %al, %ebp -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK29-NEXT: movzbl %al, %edi +; FALLBACK29-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK29-NEXT: movl 44(%esp,%edi), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 40(%esp,%edi), %edx ; FALLBACK29-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK29-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl %edi, %esi -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %esi, 4(%ebp) -; FALLBACK29-NEXT: movl %ebx, 24(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK29-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK29-NEXT: movl %esi, %edx +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: movl 32(%esp,%edi), %edi +; FALLBACK29-NEXT: shrdl %cl, %esi, %edi ; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl %edx, 4(%ecx) +; FALLBACK29-NEXT: movl %ebp, 24(%ecx) +; FALLBACK29-NEXT: movl %eax, 28(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl %eax, 16(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl %eax, 20(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl %eax, 8(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %edx, (%ebp) +; FALLBACK29-NEXT: movl %eax, 12(%ecx) +; FALLBACK29-NEXT: movl %edi, (%ecx) ; FALLBACK29-NEXT: addl $108, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -10943,44 +10519,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: movzbl %al, %ebx -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edi -; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK31-NEXT: movzbl %al, %edi +; FALLBACK31-NEXT: movl 48(%esp,%edi), %edx +; FALLBACK31-NEXT: movl 44(%esp,%edi), %eax +; 
FALLBACK31-NEXT: movl %eax, %esi +; FALLBACK31-NEXT: shrdl %cl, %edx, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK31-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK31-NEXT: movl %eax, %ebx +; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl 32(%esp,%edi), %edi ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %ebx, 4(%eax) +; FALLBACK31-NEXT: movl %esi, 4(%eax) ; FALLBACK31-NEXT: movl %ebp, 24(%eax) -; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK31-NEXT: movl %ebx, 28(%eax) -; FALLBACK31-NEXT: movl %esi, 16(%eax) -; FALLBACK31-NEXT: movl %edi, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 12(%eax) +; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK31-NEXT: movl %esi, 28(%eax) +; FALLBACK31-NEXT: movl %edx, 16(%eax) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, (%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %ebx, 20(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 8(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%eax) +; FALLBACK31-NEXT: movl %edi, (%eax) ; FALLBACK31-NEXT: addl $108, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi @@ -10998,18 +10574,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_32bytes_dwordOff: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movq 16(%rdi), %rcx ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: movl %esi, %eax ; FALLBACK0-NEXT: shlb $5, %al ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq 
%rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: sarq $63, %rdi ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -11017,53 +10592,52 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $6, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi -; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK0-NEXT: movq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: addq %r11, %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 8(%rdx) +; FALLBACK0-NEXT: movq %r14, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) -; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_32bytes_dwordOff: ; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movq 16(%rdi), %rax ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: movl %esi, %ecx ; FALLBACK1-NEXT: shlb $5, %cl ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -11071,34 +10645,33 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $6, %sil ; FALLBACK1-NEXT: movzbl %sil, %eax -; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; 
FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK1-NEXT: movq %rdi, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK1-NEXT: movq %r9, %r10 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK1-NEXT: shrdq %cl, %r9, %rax +; FALLBACK1-NEXT: sarq %cl, %rsi +; FALLBACK1-NEXT: movq %r10, 8(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: ashr_32bytes_dwordOff: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movq 16(%rdi), %rcx ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: movl %esi, %eax ; FALLBACK2-NEXT: shlb $5, %al ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -11106,43 +10679,41 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $6, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: orq %rsi, %rdi ; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK2-NEXT: orq %r9, %rcx +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rax, %r8, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rax, 8(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: ; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; 
FALLBACK3-NEXT: movq 16(%rdi), %rax ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: movl %esi, %ecx ; FALLBACK3-NEXT: shlb $5, %cl ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -11150,23 +10721,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $6, %sil ; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK3-NEXT: movq %rdi, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK3-NEXT: movq %r9, %r10 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK3-NEXT: shrdq %cl, %r9, %rax +; FALLBACK3-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK3-NEXT: movq %r10, 8(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rcx, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: ashr_32bytes_dwordOff: ; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movq 16(%rdi), %rcx @@ -11184,38 +10757,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $6, %sil ; FALLBACK4-NEXT: movzbl %sil, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK4-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK4-NEXT: movq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: orq %rbx, %r14 +; FALLBACK4-NEXT: addq %r11, %r11 ; 
FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %r8, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 8(%rdx) +; FALLBACK4-NEXT: movq %r14, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: ashr_32bytes_dwordOff: @@ -11240,16 +10814,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK5-NEXT: movq %r9, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK5-NEXT: shrdq %cl, %r9, %rax ; FALLBACK5-NEXT: sarq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: ashr_32bytes_dwordOff: @@ -11316,20 +10890,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK7-NEXT: movq %r9, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK7-NEXT: shrdq %cl, %r9, %rax +; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: movq %rcx, 24(%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: ashr_32bytes_dwordOff: ; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK8-NEXT: movq 16(%rdi), %rcx @@ -11347,38 +10922,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $6, %sil ; FALLBACK8-NEXT: movzbl %sil, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK8-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK8-NEXT: movq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 -; 
FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: orq %rbx, %r14 +; FALLBACK8-NEXT: addq %r11, %r11 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %r8, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 8(%rdx) +; FALLBACK8-NEXT: movq %r14, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: ashr_32bytes_dwordOff: @@ -11403,16 +10979,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK9-NEXT: movq %r9, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK9-NEXT: shrdq %cl, %r9, %rax ; FALLBACK9-NEXT: sarq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: ashr_32bytes_dwordOff: @@ -11479,20 +11055,21 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK11-NEXT: movq %r9, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK11-NEXT: shrdq %cl, %r9, %rax +; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: movq %rcx, 24(%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: ashr_32bytes_dwordOff: ; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK12-NEXT: movq 16(%rdi), %rcx @@ -11510,38 +11087,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $6, %sil ; FALLBACK12-NEXT: movzbl %sil, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK12-NEXT: movq -72(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq 
%cl, %r10 +; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r8 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r11 +; FALLBACK12-NEXT: movq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r14 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: orq %rbx, %r14 +; FALLBACK12-NEXT: addq %r11, %r11 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %r8, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 8(%rdx) +; FALLBACK12-NEXT: movq %r14, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: ashr_32bytes_dwordOff: @@ -11566,16 +11144,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK13-NEXT: movq %r9, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK13-NEXT: shrdq %cl, %r9, %rax ; FALLBACK13-NEXT: sarq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: ashr_32bytes_dwordOff: @@ -11642,84 +11220,56 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %r9 +; FALLBACK15-NEXT: movq %r9, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %rax +; FALLBACK15-NEXT: shrdq %cl, %r9, %rax +; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rcx ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: movq %rcx, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: retq ; ; X86-SSE2-LABEL: 
ashr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: subl $64, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %edi -; X86-SSE2-NEXT: movl 12(%eax), %ebx -; X86-SSE2-NEXT: movl 16(%eax), %ebp -; X86-SSE2-NEXT: movl 20(%eax), %esi -; X86-SSE2-NEXT: movl 24(%eax), %edx -; X86-SSE2-NEXT: movl 28(%eax), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movl 16(%edx), %esi +; X86-SSE2-NEXT: movl 20(%edx), %edi +; X86-SSE2-NEXT: movl 24(%edx), %ebx +; X86-SSE2-NEXT: movl 28(%edx), %edx +; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: sarl $31, %edx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $7, %eax -; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi -; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi -; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx -; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp -; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx -; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: andl $7, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE2-NEXT: 
movups 16(%esp,%ecx,4), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $64, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_32bytes_dwordOff: @@ -11812,29 +11362,23 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: ashr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movq 16(%rdi), %rax +; X64-SSE2-NEXT: movq 24(%rdi), %rcx ; X64-SSE2-NEXT: movzbl (%rsi), %esi -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: sarq $63, %rdi -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: sarq $63, %rcx +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $3, %esi -; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax -; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi -; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) -; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movups -72(%rsp,%rsi,8), %xmm0 +; X64-SSE2-NEXT: movups -56(%rsp,%rsi,8), %xmm1 +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_32bytes_qwordOff: @@ -11881,70 +11425,42 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no ; ; X86-SSE2-LABEL: ashr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: subl $64, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %edi -; X86-SSE2-NEXT: movl 12(%eax), %ebx -; X86-SSE2-NEXT: movl 16(%eax), %ebp -; X86-SSE2-NEXT: movl 20(%eax), %esi -; X86-SSE2-NEXT: movl 24(%eax), %edx -; X86-SSE2-NEXT: movl 28(%eax), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movl 16(%edx), %esi +; X86-SSE2-NEXT: movl 20(%edx), %edi +; X86-SSE2-NEXT: movl 24(%edx), %ebx +; X86-SSE2-NEXT: movl 28(%edx), %edx +; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: 
movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: sarl $31, %edx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $3, %eax -; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi -; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi -; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx -; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp -; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx -; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $92, %esp +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: andl $3, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE2-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $64, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_32bytes_qwordOff: @@ -12037,106 +11553,104 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_64bytes: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbp ; FALLBACK0-NEXT: pushq %r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %r14 -; FALLBACK0-NEXT: movl (%rsi), %edi -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: subq $24, %rsp +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: 
movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK0-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK0-NEXT: movl (%rsi), %r8d +; FALLBACK0-NEXT: xorps %xmm4, %xmm4 +; FALLBACK0-NEXT: movaps %xmm4, (%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rdi,8), %eax +; FALLBACK0-NEXT: leal (,%r8,8), %eax ; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %edi -; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 -; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: andl $56, %r8d +; FALLBACK0-NEXT: movq -112(%rsp,%r8), %r15 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r15 +; FALLBACK0-NEXT: movq -104(%rsp,%r8), %rcx +; FALLBACK0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %r8, %r8 +; FALLBACK0-NEXT: leaq (%rcx,%rcx), %rdi ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r10, %r8 -; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq %r10, %r15 +; FALLBACK0-NEXT: shlq %cl, %rdi +; FALLBACK0-NEXT: movq -88(%rsp,%r8), %r9 +; FALLBACK0-NEXT: movq %r9, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 +; FALLBACK0-NEXT: shrq %cl, %r12 +; FALLBACK0-NEXT: movq -80(%rsp,%r8), %r14 ; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: orq %r15, %r11 +; FALLBACK0-NEXT: movq -96(%rsp,%r8), %rbx +; FALLBACK0-NEXT: movq %rbx, %r13 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: addq %r10, %r10 +; FALLBACK0-NEXT: shrq %cl, %r13 +; FALLBACK0-NEXT: addq %r9, %r9 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: movq %rbx, %r12 +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: orq %r15, %rdi +; FALLBACK0-NEXT: movq -72(%rsp,%r8), %r15 +; FALLBACK0-NEXT: movq %r15, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r12 -; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 -; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: orq %r12, %r11 +; FALLBACK0-NEXT: movq -64(%rsp,%r8), %rbp +; FALLBACK0-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK0-NEXT: movl 
%esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 +; FALLBACK0-NEXT: shlq %cl, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: addq %rbx, %rbx +; FALLBACK0-NEXT: addq %r15, %r15 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: orq %r14, %rbx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbp +; FALLBACK0-NEXT: orq %r13, %r9 +; FALLBACK0-NEXT: orq %r10, %r12 +; FALLBACK0-NEXT: movq -56(%rsp,%r8), %r8 +; FALLBACK0-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 +; FALLBACK0-NEXT: orq %r14, %r15 +; FALLBACK0-NEXT: orq %rbp, %r10 +; FALLBACK0-NEXT: addq %rbx, %rbx ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: orq %r13, %r14 +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: orq %r13, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: movq %rdi, 56(%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %rbx, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: movq %r8, 56(%rdx) +; FALLBACK0-NEXT: movq %rbx, 8(%rdx) +; FALLBACK0-NEXT: movq %r10, 48(%rdx) +; FALLBACK0-NEXT: movq %r15, 32(%rdx) +; FALLBACK0-NEXT: movq %r12, 40(%rdx) +; FALLBACK0-NEXT: movq %r9, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: movq %rdi, (%rdx) +; FALLBACK0-NEXT: addq $24, %rsp ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: popq %rbp ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_64bytes: @@ -12144,59 +11658,52 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: pushq %r15 ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: movq (%rdi), %rcx -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %r10 -; FALLBACK1-NEXT: movq 32(%rdi), %r11 -; FALLBACK1-NEXT: movq 40(%rdi), %rbx -; FALLBACK1-NEXT: movq 48(%rdi), %r14 -; FALLBACK1-NEXT: movq 56(%rdi), %rdi +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK1-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK1-NEXT: movl (%rsi), %eax -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: xorps %xmm4, %xmm4 +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; 
FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rax,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax -; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK1-NEXT: movq %r9, %r8 -; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %rsi +; FALLBACK1-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK1-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK1-NEXT: movq %r10, %r8 +; FALLBACK1-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq %r11, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK1-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK1-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r14 ; FALLBACK1-NEXT: movq %r14, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %r14, %rax ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %r11, 48(%rdx) -; FALLBACK1-NEXT: movq %rax, 56(%rdx) -; FALLBACK1-NEXT: movq %r10, 32(%rdx) -; FALLBACK1-NEXT: movq %r15, 40(%rdx) -; FALLBACK1-NEXT: movq %rdi, 16(%rdx) -; FALLBACK1-NEXT: movq %rbx, 24(%rdx) -; FALLBACK1-NEXT: movq %rsi, (%rdx) -; FALLBACK1-NEXT: movq %r8, 8(%rdx) +; FALLBACK1-NEXT: shrq %cl, %r11 +; FALLBACK1-NEXT: movq %r15, 8(%rdx) +; FALLBACK1-NEXT: movq %r9, 48(%rdx) +; FALLBACK1-NEXT: movq %r11, 56(%rdx) +; FALLBACK1-NEXT: movq %rdi, 32(%rdx) +; FALLBACK1-NEXT: movq %rbx, 40(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: popq %rbx ; FALLBACK1-NEXT: popq %r14 ; FALLBACK1-NEXT: popq %r15 @@ -12211,77 +11718,69 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %r10 -; FALLBACK2-NEXT: movq 32(%rdi), %r11 -; FALLBACK2-NEXT: movq 40(%rdi), %rbx -; FALLBACK2-NEXT: movq 48(%rdi), %r14 -; FALLBACK2-NEXT: movq 56(%rdi), %rdi +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK2-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK2-NEXT: movl (%rsi), %eax -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: xorps %xmm4, %xmm4 +; FALLBACK2-NEXT: movaps %xmm4, 
-{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: leal (,%rax,8), %esi +; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK2-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK2-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK2-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK2-NEXT: movl %esi, %ebx +; FALLBACK2-NEXT: notb %bl +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK2-NEXT: leaq (,%rbp,2), %r8 +; FALLBACK2-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK2-NEXT: orq %r11, %r8 +; FALLBACK2-NEXT: leaq (,%r13,2), %r11 +; FALLBACK2-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK2-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rsi +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK2-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK2-NEXT: orq %r14, %r9 ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: shlxq %rbx, %r10, %r10 
+; FALLBACK2-NEXT: orq %r15, %r10 ; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax +; FALLBACK2-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) +; FALLBACK2-NEXT: addq %rcx, %rcx +; FALLBACK2-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK2-NEXT: orq %rbp, %rcx +; FALLBACK2-NEXT: movq %rsi, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 8(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r9, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 @@ -12296,59 +11795,52 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: pushq %r15 ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 -; FALLBACK3-NEXT: movq 56(%rdi), %rdi +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK3-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK3-NEXT: movl (%rsi), %eax -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: xorps %xmm4, %xmm4 +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rax,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %rsi +; FALLBACK3-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK3-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK3-NEXT: movq %r10, %r8 +; FALLBACK3-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK3-NEXT: 
shrdq %cl, %r9, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK3-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r14 ; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax +; FALLBACK3-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) +; FALLBACK3-NEXT: shrdq %cl, %r14, %rax +; FALLBACK3-NEXT: movq %r15, 8(%rdx) +; FALLBACK3-NEXT: movq %r9, 48(%rdx) +; FALLBACK3-NEXT: movq %rdi, 32(%rdx) +; FALLBACK3-NEXT: movq %rbx, 40(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rsi, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) +; FALLBACK3-NEXT: movq %r10, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 @@ -12362,14 +11854,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: pushq %r13 ; FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax +; FALLBACK4-NEXT: subq $24, %rsp ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK4-NEXT: movl (%rsi), %r8d ; FALLBACK4-NEXT: xorps %xmm4, %xmm4 -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, (%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) @@ -12380,72 +11872,74 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: leal (,%r8,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %r8d -; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movq -112(%rsp,%r8), %r15 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: shrq %cl, %r15 +; FALLBACK4-NEXT: movq -104(%rsp,%r8), %rcx +; FALLBACK4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi +; FALLBACK4-NEXT: leaq (%rcx,%rcx), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx +; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movq %r9, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK4-NEXT: shrq %cl, %r12 +; FALLBACK4-NEXT: movq -80(%rsp,%r8), %r14 +; FALLBACK4-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq 
%rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 +; FALLBACK4-NEXT: movq -96(%rsp,%r8), %rbx +; FALLBACK4-NEXT: movq %rbx, %r13 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: addq %r9, %r9 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14 -; FALLBACK4-NEXT: movq %r14, %r13 +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: orq %r15, %rdi +; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r15 +; FALLBACK4-NEXT: movq %r15, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: orq %r12, %r11 +; FALLBACK4-NEXT: movq -64(%rsp,%r8), %rbp +; FALLBACK4-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 +; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 +; FALLBACK4-NEXT: shrq %cl, %r14 +; FALLBACK4-NEXT: addq %r15, %r15 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 +; FALLBACK4-NEXT: shlq %cl, %r15 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8 -; FALLBACK4-NEXT: leaq (%r8,%r8), %r12 +; FALLBACK4-NEXT: orq %r13, %r9 +; FALLBACK4-NEXT: orq %r10, %r12 +; FALLBACK4-NEXT: movq -56(%rsp,%r8), %r8 +; FALLBACK4-NEXT: leaq (%r8,%r8), %r10 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 +; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: orq %r14, %r15 +; FALLBACK4-NEXT: orq %rbp, %r10 ; FALLBACK4-NEXT: addq %rbx, %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx +; FALLBACK4-NEXT: orq %r13, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: movq %r8, 56(%rdx) ; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) +; FALLBACK4-NEXT: movq %r10, 48(%rdx) +; FALLBACK4-NEXT: movq %r15, 32(%rdx) +; FALLBACK4-NEXT: movq %r12, 40(%rdx) +; FALLBACK4-NEXT: movq %r9, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp +; FALLBACK4-NEXT: addq $24, %rsp ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 @@ -12490,11 +11984,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 +; FALLBACK5-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK5-NEXT: movq %r14, %r15 ; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK5-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK5-NEXT: shrdq %cl, %r14, %rax ; 
FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shrq %cl, %r11 ; FALLBACK5-NEXT: movq %r15, 8(%rdx) @@ -12504,7 +11998,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq %rbx, 40(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 ; FALLBACK5-NEXT: popq %r15 @@ -12548,10 +12042,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movl %esi, %ebx ; FALLBACK6-NEXT: notb %bl ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK6-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK6-NEXT: leaq (,%r13,2), %r11 ; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK6-NEXT: orq %r12, %r11 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 @@ -12627,20 +12121,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r15 +; FALLBACK7-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK7-NEXT: movq %r14, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK7-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: shrdq %cl, %r14, %rax ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) -; FALLBACK7-NEXT: movq %r14, (%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 @@ -12655,84 +12149,86 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: pushq %rax +; FALLBACK8-NEXT: subq $24, %rsp ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK8-NEXT: movl (%rsi), %r9d +; FALLBACK8-NEXT: movl (%rsi), %r8d ; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%r9,8), %eax +; FALLBACK8-NEXT: leal (,%r8,8), %eax ; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %r9d -; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK8-NEXT: andl $56, %r8d +; FALLBACK8-NEXT: movq -112(%rsp,%r8), %r15 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: shrq %cl, %r15 +; FALLBACK8-NEXT: movq -104(%rsp,%r8), %rcx +; FALLBACK8-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: leaq (%rcx,%rcx), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, 
%rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %rbx +; FALLBACK8-NEXT: movq -88(%rsp,%r8), %r9 +; FALLBACK8-NEXT: movq %r9, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK8-NEXT: shrq %cl, %r12 +; FALLBACK8-NEXT: movq -80(%rsp,%r8), %r14 +; FALLBACK8-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r14 +; FALLBACK8-NEXT: movq -96(%rsp,%r8), %rbx +; FALLBACK8-NEXT: movq %rbx, %r13 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r14 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: addq %r9, %r9 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r14, %r10 -; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq %r14, %r13 +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: orq %r15, %rdi +; FALLBACK8-NEXT: movq -72(%rsp,%r8), %r15 +; FALLBACK8-NEXT: movq %r15, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r13 -; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: orq %r12, %r11 +; FALLBACK8-NEXT: movq -64(%rsp,%r8), %rbp +; FALLBACK8-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: orq %r13, %r15 +; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: addq %r14, %r14 +; FALLBACK8-NEXT: shrq %cl, %r14 +; FALLBACK8-NEXT: addq %r15, %r15 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: orq %r12, %r14 +; FALLBACK8-NEXT: shlq %cl, %r15 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbp -; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK8-NEXT: orq %r13, %r9 +; FALLBACK8-NEXT: orq %r10, %r12 +; FALLBACK8-NEXT: movq -56(%rsp,%r8), %r8 +; FALLBACK8-NEXT: leaq (%r8,%r8), %r10 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: orq %rbp, %r12 +; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: orq %r14, %r15 +; FALLBACK8-NEXT: orq %rbp, %r10 ; FALLBACK8-NEXT: addq %rbx, %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r8, %rbx +; FALLBACK8-NEXT: orq %r13, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 56(%rdx) +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: movq %r8, 56(%rdx) ; FALLBACK8-NEXT: movq %rbx, 8(%rdx) -; FALLBACK8-NEXT: movq %r12, 48(%rdx) -; FALLBACK8-NEXT: movq %r14, 32(%rdx) -; FALLBACK8-NEXT: movq %r15, 40(%rdx) -; FALLBACK8-NEXT: movq %r10, 16(%rdx) +; FALLBACK8-NEXT: movq %r10, 48(%rdx) +; FALLBACK8-NEXT: movq %r15, 32(%rdx) +; FALLBACK8-NEXT: movq %r12, 40(%rdx) +; FALLBACK8-NEXT: movq %r9, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: addq $8, %rsp +; FALLBACK8-NEXT: addq $24, %rsp ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; 
FALLBACK8-NEXT: popq %r13 @@ -12772,11 +12268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK9-NEXT: movq %r14, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK9-NEXT: shrdq %cl, %r14, %rax ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) @@ -12786,7 +12282,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r14, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 ; FALLBACK9-NEXT: popq %r15 @@ -12825,10 +12321,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movl %esi, %ebx ; FALLBACK10-NEXT: notb %bl ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK10-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK10-NEXT: leaq (,%r13,2), %r11 ; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK10-NEXT: orq %r12, %r11 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 @@ -12899,20 +12395,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 +; FALLBACK11-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK11-NEXT: movq %r14, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK11-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: shrdq %cl, %r14, %rax ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) ; FALLBACK11-NEXT: movq %rdi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rsi, 24(%rdx) -; FALLBACK11-NEXT: movq %r14, (%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 @@ -12928,81 +12424,83 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: pushq %rax +; FALLBACK12-NEXT: subq $24, %rsp ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; 
FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %r15 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r15 +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %rcx +; FALLBACK12-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: leaq (%rcx,%rcx), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r8 +; FALLBACK12-NEXT: movq %r8, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK12-NEXT: shrq %cl, %r12 +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %r14 +; FALLBACK12-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r14 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %rbx +; FALLBACK12-NEXT: movq %rbx, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: addq %r8, %r8 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq %r14, %r13 +; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: orq %r15, %rdi +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r15 +; FALLBACK12-NEXT: movq %r15, %r9 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: orq %r12, %r11 +; FALLBACK12-NEXT: movq -64(%rsp,%r10), %rbp +; FALLBACK12-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: orq %r13, %r15 +; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: addq %r14, %r14 +; FALLBACK12-NEXT: shrq %cl, %r14 +; FALLBACK12-NEXT: addq %r15, %r15 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: orq %r12, %r14 +; FALLBACK12-NEXT: shlq %cl, %r15 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: orq %r13, %r8 +; FALLBACK12-NEXT: orq %r9, %r12 +; FALLBACK12-NEXT: movq -56(%rsp,%r10), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r10 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: orq %rbp, %r12 +; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: orq %r14, %r15 +; FALLBACK12-NEXT: orq %rbp, %r10 ; FALLBACK12-NEXT: addq %rbx, %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r8, %rbx +; FALLBACK12-NEXT: orq %r13, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq 
%r9, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) -; FALLBACK12-NEXT: movq %r12, 48(%rdx) -; FALLBACK12-NEXT: movq %r14, 32(%rdx) -; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r10, 48(%rdx) +; FALLBACK12-NEXT: movq %r15, 32(%rdx) +; FALLBACK12-NEXT: movq %r12, 40(%rdx) +; FALLBACK12-NEXT: movq %r8, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: addq $8, %rsp +; FALLBACK12-NEXT: addq $24, %rsp ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 @@ -13018,42 +12516,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: pushq %r14 ; FALLBACK13-NEXT: pushq %rbx ; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK13-NEXT: movl (%rsi), %edi +; FALLBACK13-NEXT: movl (%rsi), %eax ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rdi,8), %ecx +; FALLBACK13-NEXT: leal (,%rax,8), %ecx ; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %edi -; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi -; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq %r9, %rax -; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax -; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10 +; FALLBACK13-NEXT: andl $56, %eax +; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq %r9, %rsi +; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK13-NEXT: movq %r10, %r8 ; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11 +; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK13-NEXT: movq %r11, %rbx ; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi -; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11 +; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14 -; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi -; FALLBACK13-NEXT: movq %rdi, %r15 +; FALLBACK13-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK13-NEXT: movq %r14, %r15 ; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14 +; FALLBACK13-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK13-NEXT: shrdq %cl, %r14, %rax ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shrq %cl, %r11 ; FALLBACK13-NEXT: movq %r15, 8(%rdx) ; FALLBACK13-NEXT: movq %r9, 48(%rdx) ; FALLBACK13-NEXT: movq %r11, 56(%rdx) -; FALLBACK13-NEXT: movq %rsi, 32(%rdx) +; FALLBACK13-NEXT: movq %rdi, 32(%rdx) ; FALLBACK13-NEXT: movq %rbx, 40(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rax, 24(%rdx) -; FALLBACK13-NEXT: movq %r14, (%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 ; FALLBACK13-NEXT: popq %r15 @@ -13089,10 +12587,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movl %ecx, %ebx ; FALLBACK14-NEXT: notb %bl ; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK14-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK14-NEXT: orq 
%r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK14-NEXT: leaq (,%r13,2), %r11 ; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK14-NEXT: orq %r12, %r11 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 @@ -13160,20 +12658,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK15-NEXT: movq %r14, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %r14, %rax ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) ; FALLBACK15-NEXT: movq %rdi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rsi, 24(%rdx) -; FALLBACK15-NEXT: movq %r14, (%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 @@ -13187,211 +12685,193 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx +; FALLBACK16-NEXT: subl $236, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK16-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK16-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: xorps %xmm4, %xmm4 +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: andl $60, %ecx +; FALLBACK16-NEXT: movl 100(%esp,%ecx), %edx +; FALLBACK16-NEXT: movl %ecx, %ebx ; FALLBACK16-NEXT: shll $3, %eax ; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %edi +; FALLBACK16-NEXT: movl %edx, %esi ; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK16-NEXT: leal (,%ebp,2), %esi ; FALLBACK16-NEXT: movb %al, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl %eax, %edi +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %edi, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movl %edi, %edx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%eax,%eax), %edi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movl %eax, %edx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %ebp ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %eax, %eax ; 
FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edx, %ebx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %eax, %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK16-NEXT: movl %edx, %edi +; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 144(%esp,%esi), %ecx ; 
FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %ebx, %edi +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK16-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movl %edi, %ecx +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK16-NEXT: leal (%eax,%eax), %ebp +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl %edi, %edx +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK16-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx +; 
FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx @@ -13424,165 +12904,120 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, (%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: addl $236, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; -; FALLBACK17-LABEL: lshr_64bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $188, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%ecx), %ebp -; FALLBACK17-NEXT: movl 44(%ecx), %ebx -; FALLBACK17-NEXT: movl 48(%ecx), %edi -; FALLBACK17-NEXT: movl 52(%ecx), %esi -; FALLBACK17-NEXT: movl 56(%ecx), %edx -; FALLBACK17-NEXT: movl 60(%ecx), %eax +; FALLBACK17-LABEL: lshr_64bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK17-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK17-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK17-NEXT: movl (%eax), %ecx +; FALLBACK17-NEXT: xorps %xmm4, %xmm4 +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %ecx, %ebx +; FALLBACK17-NEXT: andl $60, %ebx +; FALLBACK17-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK17-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: shrdl %cl, %edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %edi, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl 84(%esp,%ebx), %eax ; 
FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK17-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) +; FALLBACK17-NEXT: shrl %cl, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %eax, 56(%ecx) +; FALLBACK17-NEXT: movl %edx, 60(%ecx) +; FALLBACK17-NEXT: movl %edi, 48(%ecx) +; FALLBACK17-NEXT: movl %esi, 52(%ecx) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 40(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl %eax, 44(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) +; FALLBACK17-NEXT: movl %eax, 32(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) +; FALLBACK17-NEXT: movl %eax, 36(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) +; FALLBACK17-NEXT: movl %eax, 24(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl %eax, 28(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) +; FALLBACK17-NEXT: movl %eax, 16(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) +; FALLBACK17-NEXT: movl %eax, 20(%ecx) ; 
FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl %eax, 8(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl %eax, 12(%ecx) +; FALLBACK17-NEXT: movl %ebx, (%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: movl %eax, 4(%ecx) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -13598,67 +13033,22 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK18-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK18-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx -; FALLBACK18-NEXT: movl 48(%eax), %edi -; FALLBACK18-NEXT: movl 52(%eax), %esi -; FALLBACK18-NEXT: movl 56(%eax), %edx -; FALLBACK18-NEXT: movl 60(%eax), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: xorps %xmm4, %xmm4 +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: 
movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: leal (,%ecx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi @@ -13668,14 +13058,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK18-NEXT: orl %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: orl %edi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax @@ -13726,24 +13116,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: movl 112(%esp,%ecx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 108(%esp,%eax), %esi ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi ; 
FALLBACK18-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK18-NEXT: movl 120(%esp,%eax), %ebp +; FALLBACK18-NEXT: leal (,%ebp,2), %ecx ; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK18-NEXT: movl 116(%esp,%eax), %eax ; FALLBACK18-NEXT: shrxl %edx, %eax, %edi ; FALLBACK18-NEXT: orl %edi, %ecx ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -13800,75 +13190,31 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $188, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%ecx), %ebp -; FALLBACK19-NEXT: movl 44(%ecx), %ebx -; FALLBACK19-NEXT: movl 48(%ecx), %edi -; FALLBACK19-NEXT: movl 52(%ecx), %esi -; FALLBACK19-NEXT: movl 56(%ecx), %edx -; FALLBACK19-NEXT: movl 60(%ecx), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %ecx -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK19-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK19-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK19-NEXT: movl (%eax), %ecx +; FALLBACK19-NEXT: xorps %xmm4, %xmm4 +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; 
FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ecx, %ebp ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shll $3, %ecx ; FALLBACK19-NEXT: andl $24, %ecx -; FALLBACK19-NEXT: shrdl %cl, %edx, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %edx, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %esi @@ -13896,7 +13242,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx @@ -13908,10 +13254,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %edi, %edx ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %edi, %eax ; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 56(%ebp) ; FALLBACK19-NEXT: movl %esi, 48(%ebp) @@ -13919,7 +13265,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %ebx, 40(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 44(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 32(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 36(%ebp) @@ -13933,12 +13279,12 @@ define void 
@lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %eax, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK19-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 12(%ebp) ; FALLBACK19-NEXT: movl %edi, (%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 4(%ebp) @@ -13956,7 +13302,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: subl $236, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 @@ -13973,150 +13319,176 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi -; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: andl $60, %ecx +; FALLBACK20-NEXT: movl 100(%esp,%ecx), %edx +; FALLBACK20-NEXT: movl %ecx, %ebx ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %edx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK20-NEXT: leal (,%ebp,2), %esi ; FALLBACK20-NEXT: movb %al, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl %eax, %edi +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; 
FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movl %edi, %edx +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%eax,%eax), %edi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax +; FALLBACK20-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%edi,%edi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movl %edx, %ebx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %eax +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %ebx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; 
FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK20-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movl %edi, %ecx +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movl %edi, %edx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK20-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK20-NEXT: orl %eax, %edx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload ; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx @@ -14149,7 +13521,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: addl $236, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx @@ -14179,91 +13551,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %ecx, %ebx +; FALLBACK21-NEXT: andl $60, %ebx +; FALLBACK21-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: shrdl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %esi ; FALLBACK21-NEXT: shrdl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, 
%edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK21-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK21-NEXT: shrdl %cl, %edx, %eax +; FALLBACK21-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) +; FALLBACK21-NEXT: shrl %cl, %edx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl %eax, 56(%ecx) +; FALLBACK21-NEXT: movl %edx, 60(%ecx) +; FALLBACK21-NEXT: movl %edi, 48(%ecx) +; FALLBACK21-NEXT: movl %esi, 52(%ecx) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) +; FALLBACK21-NEXT: movl %eax, 40(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) +; FALLBACK21-NEXT: movl %eax, 44(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) +; FALLBACK21-NEXT: movl %eax, 32(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) +; FALLBACK21-NEXT: movl %eax, 36(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) +; FALLBACK21-NEXT: movl %eax, 24(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl %eax, 28(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl %eax, 16(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl %eax, 20(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl %eax, 8(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl %eax, 12(%ecx) +; FALLBACK21-NEXT: movl %ebx, (%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; FALLBACK21-NEXT: movl %eax, 4(%ecx) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -14377,7 +13748,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 
FALLBACK22-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK22-NEXT: leal (,%ebp,2), %ecx ; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax ; FALLBACK22-NEXT: shrxl %edx, %eax, %edi @@ -14455,12 +13826,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %ecx, %ebp ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx ; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: shrdl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %edx, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %esi @@ -14500,10 +13871,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %edi, %edx ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi +; FALLBACK23-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %edi, %eax ; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK23-NEXT: movl %eax, 56(%ebp) ; FALLBACK23-NEXT: movl %esi, 48(%ebp) @@ -14525,12 +13896,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %eax, 20(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 8(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 12(%ebp) ; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%ebp) ; FALLBACK23-NEXT: movl %edi, (%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 4(%ebp) @@ -14548,162 +13919,186 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: subl $236, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK24-NEXT: movl (%eax), %ecx +; FALLBACK24-NEXT: movl (%eax), %ebx ; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, %esi -; 
FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK24-NEXT: shll $3, %ecx -; FALLBACK24-NEXT: andl $24, %ecx -; FALLBACK24-NEXT: movl %edx, %edi -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: andl $60, %ebp +; FALLBACK24-NEXT: movl 100(%esp,%ebp), %edx +; FALLBACK24-NEXT: shll $3, %ebx +; FALLBACK24-NEXT: andl $24, %ebx +; FALLBACK24-NEXT: movl %edx, %eax +; FALLBACK24-NEXT: movl %ebx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%eax,%eax), %ebx -; FALLBACK24-NEXT: movl %ecx, %ebp -; FALLBACK24-NEXT: movb %cl, %ch +; FALLBACK24-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%eax,%eax), %esi +; FALLBACK24-NEXT: movb %bl, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 96(%esp,%ebp), %eax +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx +; FALLBACK24-NEXT: movl 108(%esp,%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 112(%esp,%ebp), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%eax,%eax), %edi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: movl %ebx, %edx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 116(%esp,%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %eax ; FALLBACK24-NEXT: movb %dl, %cl 
-; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax +; FALLBACK24-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK24-NEXT: movl %ebp, %ebx +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%edi,%edi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 124(%esp,%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edx, %ebx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %eax +; FALLBACK24-NEXT: movb %bl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
+; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: movl %ebx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK24-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: 
movl %edi, %ecx +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movl %edi, %edx +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK24-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK24-NEXT: orl %eax, %edx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx @@ -14736,7 +14131,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: addl $236, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx @@ -14761,91 +14156,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %ecx, %ebx +; FALLBACK25-NEXT: andl $60, %ebx +; FALLBACK25-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: shrdl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %esi ; FALLBACK25-NEXT: 
shrdl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK25-NEXT: shrdl %cl, %edx, %eax +; FALLBACK25-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) +; FALLBACK25-NEXT: shrl %cl, %edx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %eax, 56(%ecx) +; FALLBACK25-NEXT: movl %edx, 
60(%ecx) +; FALLBACK25-NEXT: movl %edi, 48(%ecx) +; FALLBACK25-NEXT: movl %esi, 52(%ecx) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) +; FALLBACK25-NEXT: movl %eax, 40(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) +; FALLBACK25-NEXT: movl %eax, 44(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) +; FALLBACK25-NEXT: movl %eax, 32(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) +; FALLBACK25-NEXT: movl %eax, 36(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) +; FALLBACK25-NEXT: movl %eax, 24(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl %eax, 28(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl %eax, 16(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl %eax, 20(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl %eax, 8(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl %eax, 12(%ecx) +; FALLBACK25-NEXT: movl %ebx, (%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: movl %eax, 4(%ecx) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -14953,7 +14347,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: orl %eax, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK26-NEXT: leal (,%ebp,2), %eax ; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax ; FALLBACK26-NEXT: shrxl %edx, %eax, %edi @@ -15026,12 +14420,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %ecx, %ebp ; FALLBACK27-NEXT: andl $60, %ebp ; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx ; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: shrdl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %edx, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %esi @@ -15071,10 +14465,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %edi, %edx ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl %edi, (%esp) # 4-byte Spill +; 
FALLBACK27-NEXT: shrdl %cl, %edi, %eax ; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK27-NEXT: movl %eax, 56(%ebp) ; FALLBACK27-NEXT: movl %esi, 48(%ebp) @@ -15096,12 +14490,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %eax, 20(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 8(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 12(%ebp) ; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 12(%ebp) ; FALLBACK27-NEXT: movl %edi, (%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 4(%ebp) @@ -15120,159 +14514,183 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: subl $236, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %ecx +; FALLBACK28-NEXT: movl (%eax), %ebx ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK28-NEXT: shll $3, %ecx -; FALLBACK28-NEXT: andl $24, %ecx -; FALLBACK28-NEXT: movl %edx, %edi -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: andl $60, %ebp +; FALLBACK28-NEXT: movl 100(%esp,%ebp), %edx +; FALLBACK28-NEXT: shll $3, %ebx +; FALLBACK28-NEXT: andl $24, %ebx +; FALLBACK28-NEXT: movl %edx, %eax +; FALLBACK28-NEXT: movl %ebx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%eax,%eax), %ebx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: movb %cl, %ch +; FALLBACK28-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%eax,%eax), %esi +; FALLBACK28-NEXT: movb %bl, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 96(%esp,%ebp), %eax +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx 
-; FALLBACK28-NEXT: orl %edi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx +; FALLBACK28-NEXT: movl 108(%esp,%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 112(%esp,%ebp), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%eax,%eax), %edi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: movl %ebx, %edx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 116(%esp,%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %eax ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax +; FALLBACK28-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK28-NEXT: movl %ebp, %ebx +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%edi,%edi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 124(%esp,%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 128(%esp,%ebx), %eax +; 
FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edx, %ebx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %eax +; FALLBACK28-NEXT: movb %bl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: movl %ebx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: 
orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK28-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movl %edi, %ecx +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movl %edi, %edx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK28-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK28-NEXT: 
leal (%ebx,%ebx), %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK28-NEXT: orl %eax, %edx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx @@ -15305,7 +14723,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: addl $236, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx @@ -15327,91 +14745,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %ecx, %ebx +; FALLBACK29-NEXT: andl $60, %ebx +; FALLBACK29-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: shrdl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %esi ; FALLBACK29-NEXT: shrdl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; 
FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK29-NEXT: shrdl %cl, %edx, %eax +; FALLBACK29-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) +; FALLBACK29-NEXT: shrl %cl, %edx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl %eax, 56(%ecx) +; FALLBACK29-NEXT: movl %edx, 60(%ecx) +; FALLBACK29-NEXT: movl %edi, 48(%ecx) +; FALLBACK29-NEXT: movl %esi, 52(%ecx) ; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) +; FALLBACK29-NEXT: movl %eax, 40(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) +; FALLBACK29-NEXT: movl %eax, 44(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) +; FALLBACK29-NEXT: movl %eax, 32(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) +; FALLBACK29-NEXT: movl %eax, 36(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) +; FALLBACK29-NEXT: movl %eax, 24(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl %eax, 28(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl %eax, 16(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl %eax, 20(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl %eax, 8(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl %eax, 12(%ecx) +; FALLBACK29-NEXT: movl %ebx, (%ecx) ; FALLBACK29-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: movl %eax, 4(%ecx) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -15516,7 +14933,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK30-NEXT: leal (,%ebp,2), %eax ; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi ; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax ; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi @@ -15586,12 +15003,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %ecx, %ebp ; FALLBACK31-NEXT: andl $60, %ebp ; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx ; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: shrdl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %edx, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %esi @@ -15631,10 +15048,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %edi, %edx ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; FALLBACK31-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %edi, %eax ; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK31-NEXT: movl %eax, 56(%ebp) ; FALLBACK31-NEXT: movl %esi, 48(%ebp) @@ -15656,12 +15073,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %eax, 20(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 8(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 12(%ebp) ; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%ebp) ; FALLBACK31-NEXT: movl %edi, (%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 4(%ebp) @@ -15684,47 +15101,31 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: lshr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rbx -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %r9 -; X64-SSE2-NEXT: movq 32(%rdi), %r10 -; 
X64-SSE2-NEXT: movq 40(%rdi), %r11 -; X64-SSE2-NEXT: movq 48(%rdi), %rbx -; X64-SSE2-NEXT: movq 56(%rdi), %rdi -; X64-SSE2-NEXT: movl (%rsi), %esi -; X64-SSE2-NEXT: xorps %xmm0, %xmm0 -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movups 32(%rdi), %xmm2 +; X64-SSE2-NEXT: movups 48(%rdi), %xmm3 +; X64-SSE2-NEXT: movl (%rsi), %eax +; X64-SSE2-NEXT: xorps %xmm4, %xmm4 +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $7, %esi -; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax -; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi -; X64-SSE2-NEXT: movq %rsi, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) -; X64-SSE2-NEXT: movq %rcx, 8(%rdx) -; X64-SSE2-NEXT: popq %rbx +; X64-SSE2-NEXT: andl $7, %eax +; X64-SSE2-NEXT: movups -128(%rsp,%rax,8), %xmm0 +; X64-SSE2-NEXT: movups -112(%rsp,%rax,8), %xmm1 +; X64-SSE2-NEXT: movups -96(%rsp,%rax,8), %xmm2 +; X64-SSE2-NEXT: movups -80(%rsp,%rax,8), %xmm3 +; X64-SSE2-NEXT: movups %xmm3, 48(%rdx) +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) +; X64-SSE2-NEXT: popq %rax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_64bytes_qwordOff: @@ -15803,130 +15204,34 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no ; ; X86-SSE2-LABEL: lshr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $188, %esp +; X86-SSE2-NEXT: subl $140, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%eax), 
%ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 40(%eax), %ebp -; X86-SSE2-NEXT: movl 44(%eax), %ebx -; X86-SSE2-NEXT: movl 48(%eax), %edi -; X86-SSE2-NEXT: movl 52(%eax), %esi -; X86-SSE2-NEXT: movl 56(%eax), %edx -; X86-SSE2-NEXT: movl 60(%eax), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $7, %eax -; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp -; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx -; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi -; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi -; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx -; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 36(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 20(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $188, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: movups 32(%edx), %xmm2 +; X86-SSE2-NEXT: movups 48(%edx), %xmm3 +; X86-SSE2-NEXT: movl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm4, %xmm4 +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: andl $7, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE2-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE2-NEXT: movups 32(%esp,%ecx,8), %xmm2 +; X86-SSE2-NEXT: movups 48(%esp,%ecx,8), %xmm3 +; X86-SSE2-NEXT: movups %xmm3, 48(%eax) +; X86-SSE2-NEXT: movups %xmm2, 32(%eax) +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $140, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_64bytes_qwordOff: @@ -16022,112 +15327,112 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_64bytes: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbp ; FALLBACK0-NEXT: pushq %r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 
8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %rdi -; FALLBACK0-NEXT: movl (%rsi), %esi -; FALLBACK0-NEXT: xorps %xmm0, %xmm0 -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: subq $40, %rsp +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK0-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK0-NEXT: movl (%rsi), %ecx +; FALLBACK0-NEXT: xorps %xmm4, %xmm4 +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, (%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rsi,8), %eax +; FALLBACK0-NEXT: leal (,%rcx,8), %eax ; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %esi -; FALLBACK0-NEXT: negl %esi -; FALLBACK0-NEXT: movslq %esi, %rbx -; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8 -; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi -; FALLBACK0-NEXT: movq %rdi, %r10 +; FALLBACK0-NEXT: andl $56, %ecx +; FALLBACK0-NEXT: negl %ecx +; FALLBACK0-NEXT: movslq %ecx, %r10 +; FALLBACK0-NEXT: movq 8(%rsp,%r10), %rdi +; FALLBACK0-NEXT: movq %rdi, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: shlq %cl, %r12 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 +; FALLBACK0-NEXT: movq (%rsp,%r10), %r9 +; FALLBACK0-NEXT: movq %r9, %r8 +; FALLBACK0-NEXT: shrq %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r8 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK0-NEXT: movq %r11, %r9 ; FALLBACK0-NEXT: shrq %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r10, %r9 -; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10 -; FALLBACK0-NEXT: movq %r10, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15 -; FALLBACK0-NEXT: movq %r15, %r11 +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK0-NEXT: movq -16(%rsp,%r10), %rbp +; FALLBACK0-NEXT: movq %rbp, %r11 ; FALLBACK0-NEXT: shrq %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: orq %r14, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: shrq %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r15, %rdi -; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14 -; FALLBACK0-NEXT: 
movq %r14, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13 -; FALLBACK0-NEXT: movq %r13, %r15 -; FALLBACK0-NEXT: shrq %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r13 -; FALLBACK0-NEXT: shrq %r10 +; FALLBACK0-NEXT: shlq %cl, %rbp +; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rbx +; FALLBACK0-NEXT: movq %rbx, %r14 +; FALLBACK0-NEXT: shrq %r14 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: orq %r13, %r10 -; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12 +; FALLBACK0-NEXT: shrq %cl, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx -; FALLBACK0-NEXT: movq %rbx, %r13 +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r15 +; FALLBACK0-NEXT: movq %r15, %r13 ; FALLBACK0-NEXT: shrq %r13 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: orq %r12, %r13 +; FALLBACK0-NEXT: orq %r12, %r8 +; FALLBACK0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK0-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; FALLBACK0-NEXT: movq 16(%rsp,%r10), %r12 +; FALLBACK0-NEXT: movq %r12, %r8 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: shrq %r14 +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK0-NEXT: orq %rbp, %r14 +; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: movq 24(%rsp,%r10), %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %r13, 56(%rdx) -; FALLBACK0-NEXT: movq %r10, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r13 +; FALLBACK0-NEXT: orq %r8, %rdi +; FALLBACK0-NEXT: shrq %r12 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r12 +; FALLBACK0-NEXT: orq %r10, %r12 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: movq %r15, (%rdx) +; FALLBACK0-NEXT: movq %r12, 56(%rdx) +; FALLBACK0-NEXT: movq %rdi, 48(%rdx) +; FALLBACK0-NEXT: movq %r13, 8(%rdx) +; FALLBACK0-NEXT: movq %r14, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: movq %r9, 32(%rdx) +; FALLBACK0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; FALLBACK0-NEXT: movq %rax, 40(%rdx) +; FALLBACK0-NEXT: addq $40, %rsp ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: popq %rbp ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_64bytes: @@ -16135,60 +15440,53 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx ; FALLBACK1-NEXT: pushq %rax -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %rcx -; FALLBACK1-NEXT: movq 16(%rdi), %r8 -; FALLBACK1-NEXT: movq 24(%rdi), %r9 -; FALLBACK1-NEXT: movq 32(%rdi), %r10 -; FALLBACK1-NEXT: movq 40(%rdi), %r11 -; FALLBACK1-NEXT: movq 48(%rdi), %rbx -; 
FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %esi -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK1-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK1-NEXT: movl (%rsi), %eax +; FALLBACK1-NEXT: xorps %xmm4, %xmm4 +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx +; FALLBACK1-NEXT: leal (,%rax,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %esi -; FALLBACK1-NEXT: negl %esi -; FALLBACK1-NEXT: movslq %esi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax -; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq %r10, %rsi +; FALLBACK1-NEXT: andl $56, %eax +; FALLBACK1-NEXT: negl %eax +; FALLBACK1-NEXT: movslq %eax, %r8 +; FALLBACK1-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK1-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK1-NEXT: movq %r9, %rsi ; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8 -; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi +; FALLBACK1-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK1-NEXT: shldq %cl, %rdi, %rax -; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx +; FALLBACK1-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK1-NEXT: shldq %cl, %r10, %rdi +; FALLBACK1-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK1-NEXT: shldq %cl, %r11, %r10 +; FALLBACK1-NEXT: movq -16(%rsp,%r8), %rbx ; FALLBACK1-NEXT: movq %rbx, %r14 -; FALLBACK1-NEXT: shldq %cl, %r11, %r14 -; FALLBACK1-NEXT: shldq %cl, %r10, %r11 -; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK1-NEXT: shldq %cl, %r10, %r9 -; FALLBACK1-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK1-NEXT: shldq %cl, %r8, %rdi +; FALLBACK1-NEXT: shldq %cl, %r9, %r14 +; FALLBACK1-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK1-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK1-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK1-NEXT: movq %r8, %rbx +; FALLBACK1-NEXT: shlq %cl, %rbx ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %r10, 48(%rdx) +; FALLBACK1-NEXT: shldq %cl, %r8, %r11 ; FALLBACK1-NEXT: movq %r9, 56(%rdx) -; FALLBACK1-NEXT: movq %r11, 32(%rdx) -; FALLBACK1-NEXT: movq %r14, 40(%rdx) -; FALLBACK1-NEXT: movq %rax, 16(%rdx) -; FALLBACK1-NEXT: movq %rsi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rdi, 8(%rdx) +; FALLBACK1-NEXT: movq %r14, 48(%rdx) +; FALLBACK1-NEXT: movq %r11, 8(%rdx) +; FALLBACK1-NEXT: movq %r10, 16(%rdx) +; FALLBACK1-NEXT: movq %rdi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, 32(%rdx) +; 
FALLBACK1-NEXT: movq %rsi, 40(%rdx) +; FALLBACK1-NEXT: movq %rbx, (%rdx) ; FALLBACK1-NEXT: addq $8, %rsp ; FALLBACK1-NEXT: popq %rbx ; FALLBACK1-NEXT: popq %r14 @@ -16202,81 +15500,74 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rax -; FALLBACK2-NEXT: movq 8(%rdi), %rcx -; FALLBACK2-NEXT: movq 16(%rdi), %r8 -; FALLBACK2-NEXT: movq 24(%rdi), %r9 -; FALLBACK2-NEXT: movq 32(%rdi), %r10 -; FALLBACK2-NEXT: movq 40(%rdi), %r11 -; FALLBACK2-NEXT: movq 48(%rdi), %rbx -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %esi -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: subq $24, %rsp +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK2-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK2-NEXT: movl (%rsi), %eax +; FALLBACK2-NEXT: xorps %xmm4, %xmm4 +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm3, (%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rsi,8), %eax +; FALLBACK2-NEXT: leal (,%rax,8), %ecx +; FALLBACK2-NEXT: andl $56, %ecx ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: andl $56, %esi -; FALLBACK2-NEXT: negl %esi -; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx -; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: negl %eax +; FALLBACK2-NEXT: movslq %eax, %rsi +; FALLBACK2-NEXT: movq -8(%rsp,%rsi), %rax +; FALLBACK2-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rdi +; 
FALLBACK2-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r13 +; FALLBACK2-NEXT: shlxq %rcx, %r13, %r8 +; FALLBACK2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r10 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rbx +; FALLBACK2-NEXT: movl %ecx, %r9d +; FALLBACK2-NEXT: notb %r9b ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK2-NEXT: orq %r12, %rdi +; FALLBACK2-NEXT: movq (%rsp,%rsi), %rbp +; FALLBACK2-NEXT: shlxq %rcx, %rbp, %r8 +; FALLBACK2-NEXT: shrq %r13 +; FALLBACK2-NEXT: shrxq %r9, %r13, %r12 +; FALLBACK2-NEXT: orq %r15, %r12 +; FALLBACK2-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %r9, %r11, %r11 +; FALLBACK2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK2-NEXT: shrq %r14 +; FALLBACK2-NEXT: shrxq %r9, %r14, %r14 +; FALLBACK2-NEXT: orq %r10, %r14 ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) -; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp +; FALLBACK2-NEXT: shrxq %r9, %rsi, %rsi +; FALLBACK2-NEXT: orq %rbx, %rsi +; FALLBACK2-NEXT: shrq %rax +; FALLBACK2-NEXT: shrxq %r9, %rax, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrq %rbp +; FALLBACK2-NEXT: shrxq %r9, %rbp, %r8 +; FALLBACK2-NEXT: orq %r15, %r8 +; FALLBACK2-NEXT: movq %rcx, (%rdx) +; FALLBACK2-NEXT: movq %r8, 56(%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) +; FALLBACK2-NEXT: movq %rsi, 8(%rdx) +; FALLBACK2-NEXT: movq %r14, 16(%rdx) +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %r12, 32(%rdx) +; FALLBACK2-NEXT: movq %rdi, 40(%rdx) +; FALLBACK2-NEXT: addq $24, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 ; FALLBACK2-NEXT: popq %r13 @@ -16290,59 +15581,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx ; FALLBACK3-NEXT: pushq %rax -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %rcx -; FALLBACK3-NEXT: movq 16(%rdi), %r8 -; FALLBACK3-NEXT: movq 24(%rdi), %r9 -; FALLBACK3-NEXT: movq 32(%rdi), %r10 -; FALLBACK3-NEXT: movq 40(%rdi), %r11 -; FALLBACK3-NEXT: movq 48(%rdi), %rbx -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %esi -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK3-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK3-NEXT: movl (%rsi), %eax +; FALLBACK3-NEXT: xorps %xmm4, %xmm4 +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, 
-{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx +; FALLBACK3-NEXT: leal (,%rax,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %esi -; FALLBACK3-NEXT: negl %esi -; FALLBACK3-NEXT: movslq %esi, %r8 -; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax -; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK3-NEXT: andl $56, %eax +; FALLBACK3-NEXT: negl %eax +; FALLBACK3-NEXT: movslq %eax, %r8 +; FALLBACK3-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK3-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK3-NEXT: movq %r9, %rsi ; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10 -; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi +; FALLBACK3-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK3-NEXT: shldq %cl, %rdi, %rax -; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11 -; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx +; FALLBACK3-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK3-NEXT: shldq %cl, %r10, %rdi +; FALLBACK3-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK3-NEXT: shldq %cl, %r11, %r10 +; FALLBACK3-NEXT: movq -16(%rsp,%r8), %rbx ; FALLBACK3-NEXT: movq %rbx, %r14 -; FALLBACK3-NEXT: shldq %cl, %r11, %r14 -; FALLBACK3-NEXT: shldq %cl, %r9, %r11 -; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK3-NEXT: shldq %cl, %r9, %r8 +; FALLBACK3-NEXT: shldq %cl, %r9, %r14 +; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r9 ; FALLBACK3-NEXT: shldq %cl, %rbx, %r9 -; FALLBACK3-NEXT: shldq %cl, %r10, %rdi -; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx -; FALLBACK3-NEXT: movq %r9, 48(%rdx) -; FALLBACK3-NEXT: movq %r8, 56(%rdx) -; FALLBACK3-NEXT: movq %r11, 32(%rdx) -; FALLBACK3-NEXT: movq %r14, 40(%rdx) -; FALLBACK3-NEXT: movq %rax, 16(%rdx) -; FALLBACK3-NEXT: movq %rsi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rdi, 8(%rdx) +; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK3-NEXT: shlxq %rcx, %r8, %rbx +; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK3-NEXT: shldq %cl, %r8, %r11 +; FALLBACK3-NEXT: movq %r9, 56(%rdx) +; FALLBACK3-NEXT: movq %r14, 48(%rdx) +; FALLBACK3-NEXT: movq %r11, 8(%rdx) +; FALLBACK3-NEXT: movq %r10, 16(%rdx) +; FALLBACK3-NEXT: movq %rdi, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, 32(%rdx) +; FALLBACK3-NEXT: movq %rsi, 40(%rdx) +; FALLBACK3-NEXT: movq %rbx, (%rdx) ; FALLBACK3-NEXT: addq $8, %rsp ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 @@ -16350,11 +15634,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK4-LABEL: shl_64bytes: ; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbp ; FALLBACK4-NEXT: pushq %r15 ; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %r13 ; FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: subq $40, %rsp ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 @@ -16365,96 +15651,102 @@ define void 
@shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, (%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%rcx,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %ecx ; FALLBACK4-NEXT: negl %ecx -; FALLBACK4-NEXT: movslq %ecx, %r9 -; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK4-NEXT: movq %rdi, %r10 +; FALLBACK4-NEXT: movslq %ecx, %r10 +; FALLBACK4-NEXT: movq 8(%rsp,%r10), %rdi +; FALLBACK4-NEXT: movq %rdi, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK4-NEXT: movq %r11, %r8 +; FALLBACK4-NEXT: movq (%rsp,%r10), %r9 +; FALLBACK4-NEXT: movq %r9, %r8 ; FALLBACK4-NEXT: shrq %r8 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK4-NEXT: movq %rbx, %r10 -; FALLBACK4-NEXT: shrq %r10 +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK4-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK4-NEXT: movq %r11, %r9 +; FALLBACK4-NEXT: shrq %r9 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r11, %r10 +; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK4-NEXT: movq %r15, %r11 +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK4-NEXT: movq -16(%rsp,%r10), %rbp +; FALLBACK4-NEXT: movq %rbp, %r11 ; FALLBACK4-NEXT: shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %rbx -; FALLBACK4-NEXT: shrq %rbx +; FALLBACK4-NEXT: shlq %cl, %rbp +; FALLBACK4-NEXT: movq -24(%rsp,%r10), %rbx +; FALLBACK4-NEXT: movq %rbx, %r14 +; FALLBACK4-NEXT: shrq %r14 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: orq %r15, %rbx +; FALLBACK4-NEXT: shrq %cl, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: movq %r14, %r15 -; FALLBACK4-NEXT: shrq %r15 +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: movq -32(%rsp,%r10), %r15 +; FALLBACK4-NEXT: movq %r15, %r13 +; FALLBACK4-NEXT: shrq %r13 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r15 -; FALLBACK4-NEXT: orq %r12, %r15 -; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %r13 +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: orq %r12, %r8 +; FALLBACK4-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK4-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; FALLBACK4-NEXT: movq 16(%rsp,%r10), %r12 +; FALLBACK4-NEXT: movq %r12, %r8 ; FALLBACK4-NEXT: 
movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r13 +; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK4-NEXT: orq %rbp, %r14 ; FALLBACK4-NEXT: shrq %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r13, %rdi -; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK4-NEXT: movq 24(%rsp,%r10), %r10 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %rbx, %r13 +; FALLBACK4-NEXT: orq %r8, %rdi ; FALLBACK4-NEXT: shrq %r12 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: orq %r9, %r12 +; FALLBACK4-NEXT: orq %r10, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: movq %r14, (%rdx) +; FALLBACK4-NEXT: shlq %cl, %r15 +; FALLBACK4-NEXT: movq %r15, (%rdx) ; FALLBACK4-NEXT: movq %r12, 56(%rdx) ; FALLBACK4-NEXT: movq %rdi, 48(%rdx) -; FALLBACK4-NEXT: movq %r15, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r13, 8(%rdx) +; FALLBACK4-NEXT: movq %r14, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 32(%rdx) -; FALLBACK4-NEXT: movq %r8, 40(%rdx) +; FALLBACK4-NEXT: movq %r9, 32(%rdx) +; FALLBACK4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; FALLBACK4-NEXT: movq %rax, 40(%rdx) +; FALLBACK4-NEXT: addq $40, %rsp ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 ; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: popq %r15 +; FALLBACK4-NEXT: popq %rbp ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: shl_64bytes: ; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 ; FALLBACK5-NEXT: pushq %r14 ; FALLBACK5-NEXT: pushq %rbx +; FALLBACK5-NEXT: pushq %rax ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 @@ -16482,29 +15774,29 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: shldq %cl, %rdi, %rax ; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK5-NEXT: shldq %cl, %r10, %rdi -; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK5-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK5-NEXT: movq %r14, %r15 -; FALLBACK5-NEXT: shldq %cl, %r9, %r15 -; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK5-NEXT: shldq %cl, %r14, %r8 -; FALLBACK5-NEXT: movq %r11, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 +; FALLBACK5-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK5-NEXT: shldq %cl, %r11, %r10 +; FALLBACK5-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK5-NEXT: movq %rbx, %r14 +; FALLBACK5-NEXT: shldq %cl, %r9, %r14 +; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK5-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK5-NEXT: movq %r8, %rbx +; FALLBACK5-NEXT: shlq %cl, %rbx ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shldq %cl, %r11, %rbx -; FALLBACK5-NEXT: movq %r8, 56(%rdx) -; FALLBACK5-NEXT: movq %r15, 48(%rdx) -; FALLBACK5-NEXT: movq %rbx, 8(%rdx) +; FALLBACK5-NEXT: shldq %cl, %r8, %r11 +; FALLBACK5-NEXT: movq %r9, 56(%rdx) +; FALLBACK5-NEXT: movq %r14, 48(%rdx) +; FALLBACK5-NEXT: movq %r11, 8(%rdx) ; FALLBACK5-NEXT: movq %r10, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %rax, 32(%rdx) ; FALLBACK5-NEXT: movq %rsi, 40(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: movq %rbx, (%rdx) +; 
FALLBACK5-NEXT: addq $8, %rsp ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: shl_64bytes: @@ -16593,9 +15885,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK7-LABEL: shl_64bytes: ; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 ; FALLBACK7-NEXT: pushq %r14 ; FALLBACK7-NEXT: pushq %rbx +; FALLBACK7-NEXT: pushq %rax ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 @@ -16623,132 +15915,140 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: shldq %cl, %rdi, %rax ; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK7-NEXT: shldq %cl, %r10, %rdi -; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK7-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK7-NEXT: movq %r14, %r15 -; FALLBACK7-NEXT: shldq %cl, %r9, %r15 -; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK7-NEXT: shldq %cl, %r14, %r8 -; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK7-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK7-NEXT: shldq %cl, %r11, %r10 +; FALLBACK7-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK7-NEXT: movq %rbx, %r14 +; FALLBACK7-NEXT: shldq %cl, %r9, %r14 +; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK7-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK7-NEXT: shlxq %rcx, %r8, %rbx ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r11, %rbx -; FALLBACK7-NEXT: movq %r8, 56(%rdx) -; FALLBACK7-NEXT: movq %r15, 48(%rdx) -; FALLBACK7-NEXT: movq %rbx, 8(%rdx) +; FALLBACK7-NEXT: shldq %cl, %r8, %r11 +; FALLBACK7-NEXT: movq %r9, 56(%rdx) +; FALLBACK7-NEXT: movq %r14, 48(%rdx) +; FALLBACK7-NEXT: movq %r11, 8(%rdx) ; FALLBACK7-NEXT: movq %r10, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %rax, 32(%rdx) ; FALLBACK7-NEXT: movq %rsi, 40(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: movq %rbx, (%rdx) +; FALLBACK7-NEXT: addq $8, %rsp ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: shl_64bytes: ; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbp ; FALLBACK8-NEXT: pushq %r15 ; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: subq $40, %rsp ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK8-NEXT: movl (%rsi), %ecx ; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm1, (%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%rcx,8), %eax ; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %ecx ; FALLBACK8-NEXT: negl %ecx -; FALLBACK8-NEXT: movslq %ecx, %r9 -; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK8-NEXT: movq %rdi, %r10 +; FALLBACK8-NEXT: movslq %ecx, %r10 +; FALLBACK8-NEXT: movq 8(%rsp,%r10), %rdi +; FALLBACK8-NEXT: movq %rdi, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK8-NEXT: movq %r11, %r8 +; 
FALLBACK8-NEXT: movq (%rsp,%r10), %r9 +; FALLBACK8-NEXT: movq %r9, %r8 ; FALLBACK8-NEXT: shrq %r8 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: orq %r10, %r8 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r10 -; FALLBACK8-NEXT: shrq %r10 +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK8-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK8-NEXT: movq %r11, %r9 +; FALLBACK8-NEXT: shrq %r9 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r11, %r10 +; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK8-NEXT: movq %r15, %r11 +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK8-NEXT: movq -16(%rsp,%r10), %rbp +; FALLBACK8-NEXT: movq %rbp, %r11 ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %rbx -; FALLBACK8-NEXT: shrq %rbx +; FALLBACK8-NEXT: shlq %cl, %rbp +; FALLBACK8-NEXT: movq -24(%rsp,%r10), %rbx +; FALLBACK8-NEXT: movq %rbx, %r14 +; FALLBACK8-NEXT: shrq %r14 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: orq %r15, %rbx +; FALLBACK8-NEXT: shrq %cl, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: movq %r14, %r15 -; FALLBACK8-NEXT: shrq %r15 +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: movq -32(%rsp,%r10), %r15 +; FALLBACK8-NEXT: movq %r15, %r13 +; FALLBACK8-NEXT: shrq %r13 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r15 -; FALLBACK8-NEXT: orq %r12, %r15 -; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %r13 +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: orq %r12, %r8 +; FALLBACK8-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK8-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; FALLBACK8-NEXT: movq 16(%rsp,%r10), %r12 +; FALLBACK8-NEXT: movq %r12, %r8 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r13 +; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK8-NEXT: orq %rbp, %r14 ; FALLBACK8-NEXT: shrq %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r13, %rdi -; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK8-NEXT: movq 24(%rsp,%r10), %r10 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %rbx, %r13 +; FALLBACK8-NEXT: orq %r8, %rdi ; FALLBACK8-NEXT: shrq %r12 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: orq %r9, %r12 +; FALLBACK8-NEXT: orq %r10, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: movq %r14, (%rdx) +; FALLBACK8-NEXT: shlq %cl, %r15 +; FALLBACK8-NEXT: movq %r15, (%rdx) ; FALLBACK8-NEXT: movq %r12, 56(%rdx) ; FALLBACK8-NEXT: movq %rdi, 48(%rdx) -; FALLBACK8-NEXT: movq %r15, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r13, 8(%rdx) +; 
FALLBACK8-NEXT: movq %r14, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 32(%rdx) -; FALLBACK8-NEXT: movq %r8, 40(%rdx) +; FALLBACK8-NEXT: movq %r9, 32(%rdx) +; FALLBACK8-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; FALLBACK8-NEXT: movq %rax, 40(%rdx) +; FALLBACK8-NEXT: addq $40, %rsp ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; FALLBACK8-NEXT: popq %r13 ; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: popq %r15 +; FALLBACK8-NEXT: popq %rbp ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: shl_64bytes: ; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 ; FALLBACK9-NEXT: pushq %r14 ; FALLBACK9-NEXT: pushq %rbx +; FALLBACK9-NEXT: pushq %rax ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK9-NEXT: movl (%rsi), %eax @@ -16770,29 +16070,29 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: shldq %cl, %rdi, %rax ; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK9-NEXT: shldq %cl, %r10, %rdi -; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK9-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK9-NEXT: movq %r14, %r15 -; FALLBACK9-NEXT: shldq %cl, %r9, %r15 -; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK9-NEXT: shldq %cl, %r14, %r8 -; FALLBACK9-NEXT: movq %r11, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 +; FALLBACK9-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK9-NEXT: shldq %cl, %r11, %r10 +; FALLBACK9-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK9-NEXT: movq %rbx, %r14 +; FALLBACK9-NEXT: shldq %cl, %r9, %r14 +; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK9-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK9-NEXT: movq %r8, %rbx +; FALLBACK9-NEXT: shlq %cl, %rbx ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shldq %cl, %r11, %rbx -; FALLBACK9-NEXT: movq %r8, 56(%rdx) -; FALLBACK9-NEXT: movq %r15, 48(%rdx) -; FALLBACK9-NEXT: movq %rbx, 8(%rdx) +; FALLBACK9-NEXT: shldq %cl, %r8, %r11 +; FALLBACK9-NEXT: movq %r9, 56(%rdx) +; FALLBACK9-NEXT: movq %r14, 48(%rdx) +; FALLBACK9-NEXT: movq %r11, 8(%rdx) ; FALLBACK9-NEXT: movq %r10, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %rax, 32(%rdx) ; FALLBACK9-NEXT: movq %rsi, 40(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: movq %rbx, (%rdx) +; FALLBACK9-NEXT: addq $8, %rsp ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; @@ -16877,9 +16177,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK11-LABEL: shl_64bytes: ; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 ; FALLBACK11-NEXT: pushq %r14 ; FALLBACK11-NEXT: pushq %rbx +; FALLBACK11-NEXT: pushq %rax ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK11-NEXT: movl (%rsi), %eax @@ -16901,38 +16201,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: shldq %cl, %rdi, %rax ; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK11-NEXT: shldq %cl, %r10, %rdi -; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK11-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK11-NEXT: movq %r14, %r15 -; FALLBACK11-NEXT: shldq %cl, %r9, %r15 -; FALLBACK11-NEXT: movq 
-8(%rsp,%r8), %r8 -; FALLBACK11-NEXT: shldq %cl, %r14, %r8 -; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK11-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK11-NEXT: shldq %cl, %r11, %r10 +; FALLBACK11-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK11-NEXT: movq %rbx, %r14 +; FALLBACK11-NEXT: shldq %cl, %r9, %r14 +; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK11-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK11-NEXT: shlxq %rcx, %r8, %rbx ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r11, %rbx -; FALLBACK11-NEXT: movq %r8, 56(%rdx) -; FALLBACK11-NEXT: movq %r15, 48(%rdx) -; FALLBACK11-NEXT: movq %rbx, 8(%rdx) +; FALLBACK11-NEXT: shldq %cl, %r8, %r11 +; FALLBACK11-NEXT: movq %r9, 56(%rdx) +; FALLBACK11-NEXT: movq %r14, 48(%rdx) +; FALLBACK11-NEXT: movq %r11, 8(%rdx) ; FALLBACK11-NEXT: movq %r10, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %rax, 32(%rdx) ; FALLBACK11-NEXT: movq %rsi, 40(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: movq %rbx, (%rdx) +; FALLBACK11-NEXT: addq $8, %rsp ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: shl_64bytes: ; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbp ; FALLBACK12-NEXT: pushq %r15 ; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: subq $40, %rsp ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK12-NEXT: movl (%rsi), %ecx ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -16942,89 +16244,95 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %ecx ; FALLBACK12-NEXT: negl %ecx -; FALLBACK12-NEXT: movslq %ecx, %r9 -; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK12-NEXT: movq %rdi, %r10 +; FALLBACK12-NEXT: movslq %ecx, %r10 +; FALLBACK12-NEXT: movq 8(%rsp,%r10), %rdi +; FALLBACK12-NEXT: movq %rdi, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK12-NEXT: movq %r11, %r8 +; FALLBACK12-NEXT: movq (%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %r8 ; FALLBACK12-NEXT: shrq %r8 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: orq %r10, %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r10 -; FALLBACK12-NEXT: shrq %r10 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK12-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK12-NEXT: movq %r11, %r9 +; FALLBACK12-NEXT: shrq %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r11, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK12-NEXT: movq %r15, %r11 +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK12-NEXT: movq -16(%rsp,%r10), %rbp +; FALLBACK12-NEXT: movq %rbp, %r11 ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, 
%r11 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %rbx -; FALLBACK12-NEXT: shrq %rbx +; FALLBACK12-NEXT: shlq %cl, %rbp +; FALLBACK12-NEXT: movq -24(%rsp,%r10), %rbx +; FALLBACK12-NEXT: movq %rbx, %r14 +; FALLBACK12-NEXT: shrq %r14 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: orq %r15, %rbx +; FALLBACK12-NEXT: shrq %cl, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: movq %r14, %r15 -; FALLBACK12-NEXT: shrq %r15 +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: movq -32(%rsp,%r10), %r15 +; FALLBACK12-NEXT: movq %r15, %r13 +; FALLBACK12-NEXT: shrq %r13 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r15 -; FALLBACK12-NEXT: orq %r12, %r15 -; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %r13 +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: orq %r12, %r8 +; FALLBACK12-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK12-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; FALLBACK12-NEXT: movq 16(%rsp,%r10), %r12 +; FALLBACK12-NEXT: movq %r12, %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r13 +; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK12-NEXT: orq %rbp, %r14 ; FALLBACK12-NEXT: shrq %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r13, %rdi -; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK12-NEXT: movq 24(%rsp,%r10), %r10 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %rbx, %r13 +; FALLBACK12-NEXT: orq %r8, %rdi ; FALLBACK12-NEXT: shrq %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: orq %r9, %r12 +; FALLBACK12-NEXT: orq %r10, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: movq %r14, (%rdx) +; FALLBACK12-NEXT: shlq %cl, %r15 +; FALLBACK12-NEXT: movq %r15, (%rdx) ; FALLBACK12-NEXT: movq %r12, 56(%rdx) ; FALLBACK12-NEXT: movq %rdi, 48(%rdx) -; FALLBACK12-NEXT: movq %r15, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r13, 8(%rdx) +; FALLBACK12-NEXT: movq %r14, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 32(%rdx) -; FALLBACK12-NEXT: movq %r8, 40(%rdx) +; FALLBACK12-NEXT: movq %r9, 32(%rdx) +; FALLBACK12-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; FALLBACK12-NEXT: movq %rax, 40(%rdx) +; FALLBACK12-NEXT: addq $40, %rsp ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 ; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: popq %r15 +; FALLBACK12-NEXT: popq %rbp ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: shl_64bytes: ; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 ; FALLBACK13-NEXT: pushq %r14 ; FALLBACK13-NEXT: pushq %rbx +; FALLBACK13-NEXT: pushq %rax ; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK13-NEXT: movl (%rsi), %eax ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -17043,29 +16351,29 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: shldq %cl, %rdi, %rax ; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK13-NEXT: shldq %cl, %r10, %rdi -; 
FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK13-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK13-NEXT: movq %r14, %r15 -; FALLBACK13-NEXT: shldq %cl, %r9, %r15 -; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK13-NEXT: shldq %cl, %r14, %r8 -; FALLBACK13-NEXT: movq %r11, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 +; FALLBACK13-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK13-NEXT: shldq %cl, %r11, %r10 +; FALLBACK13-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK13-NEXT: movq %rbx, %r14 +; FALLBACK13-NEXT: shldq %cl, %r9, %r14 +; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK13-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK13-NEXT: movq %r8, %rbx +; FALLBACK13-NEXT: shlq %cl, %rbx ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shldq %cl, %r11, %rbx -; FALLBACK13-NEXT: movq %r8, 56(%rdx) -; FALLBACK13-NEXT: movq %r15, 48(%rdx) -; FALLBACK13-NEXT: movq %rbx, 8(%rdx) +; FALLBACK13-NEXT: shldq %cl, %r8, %r11 +; FALLBACK13-NEXT: movq %r9, 56(%rdx) +; FALLBACK13-NEXT: movq %r14, 48(%rdx) +; FALLBACK13-NEXT: movq %r11, 8(%rdx) ; FALLBACK13-NEXT: movq %r10, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %rax, 32(%rdx) ; FALLBACK13-NEXT: movq %rsi, 40(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: movq %rbx, (%rdx) +; FALLBACK13-NEXT: addq $8, %rsp ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; @@ -17147,9 +16455,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK15-LABEL: shl_64bytes: ; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx +; FALLBACK15-NEXT: pushq %rax ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK15-NEXT: movl (%rsi), %eax ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -17168,28 +16476,28 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: shldq %cl, %rdi, %rax ; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK15-NEXT: shldq %cl, %r10, %rdi -; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK15-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK15-NEXT: movq %r14, %r15 -; FALLBACK15-NEXT: shldq %cl, %r9, %r15 -; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK15-NEXT: shldq %cl, %r14, %r8 -; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK15-NEXT: movq -56(%rsp,%r8), %r11 +; FALLBACK15-NEXT: shldq %cl, %r11, %r10 +; FALLBACK15-NEXT: movq -16(%rsp,%r8), %rbx +; FALLBACK15-NEXT: movq %rbx, %r14 +; FALLBACK15-NEXT: shldq %cl, %r9, %r14 +; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r9 +; FALLBACK15-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r8 +; FALLBACK15-NEXT: shlxq %rcx, %r8, %rbx ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r11, %rbx -; FALLBACK15-NEXT: movq %r8, 56(%rdx) -; FALLBACK15-NEXT: movq %r15, 48(%rdx) -; FALLBACK15-NEXT: movq %rbx, 8(%rdx) +; FALLBACK15-NEXT: shldq %cl, %r8, %r11 +; FALLBACK15-NEXT: movq %r9, 56(%rdx) +; FALLBACK15-NEXT: movq %r14, 48(%rdx) +; FALLBACK15-NEXT: movq %r11, 8(%rdx) ; FALLBACK15-NEXT: movq %r10, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 24(%rdx) ; FALLBACK15-NEXT: movq %rax, 32(%rdx) ; FALLBACK15-NEXT: movq %rsi, 40(%rdx) -; FALLBACK15-NEXT: movq %r9, 
(%rdx) +; FALLBACK15-NEXT: movq %rbx, (%rdx) +; FALLBACK15-NEXT: addq $8, %rsp ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; @@ -17199,233 +16507,206 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx +; FALLBACK16-NEXT: subl $220, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK16-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK16-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: xorps %xmm4, %xmm4 +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; 
FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: andl $60, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: subl %edx, %ecx -; FALLBACK16-NEXT: movl (%ecx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %edx -; FALLBACK16-NEXT: movl %ecx, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: andl $60, %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK16-NEXT: subl %ecx, %edx +; FALLBACK16-NEXT: movl (%edx), %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%edx), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: shll $3, %eax ; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %esi ; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %al, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 8(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, %edi -; FALLBACK16-NEXT: movl %esi, %ebp +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: notb %cl +; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%edx), %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl 8(%edx), %edx +; FALLBACK16-NEXT: movl %edx, %ebp ; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx +; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: 
movl 20(%edi), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movl 20(%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 16(%edi), %esi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl 16(%ebx), %edx +; FALLBACK16-NEXT: movl %edx, %esi +; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movb %ch, %dl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %esi +; FALLBACK16-NEXT: movl 28(%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: movl %eax, %ebx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl 24(%esi), %eax +; FALLBACK16-NEXT: movl %eax, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl 28(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 24(%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: 
movl 36(%esi), %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %eax +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl 32(%esi), %ebx +; FALLBACK16-NEXT: movl %ebx, %edi ; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %ebp, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: movl 44(%edi), %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %esi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%edx), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 32(%edx), %esi -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: movl 40(%edi), %edi +; FALLBACK16-NEXT: movl %edi, %ebx +; FALLBACK16-NEXT: shrl %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 44(%ebp), %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: orl %esi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 40(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, 
%edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 52(%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: negl %edx -; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl 52(%ecx), %ebx ; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: negl %ecx +; FALLBACK16-NEXT: movl 192(%esp,%ecx), %edx +; FALLBACK16-NEXT: movl %edx, %esi +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl %ebp, %esi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%edi), %edx +; FALLBACK16-NEXT: movl 60(%edi), %ebp ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl 56(%edi), %ebx -; FALLBACK16-NEXT: movl %ebx, %edi +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 56(%edi), %ebp +; FALLBACK16-NEXT: movl %ebp, %edi ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi ; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: shrl %ebx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: orl %ebp, %ebx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 56(%eax) +; FALLBACK16-NEXT: movl %ebx, 56(%eax) ; FALLBACK16-NEXT: movl %edi, 60(%eax) ; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl %esi, 52(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -17448,7 +16729,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: addl $220, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx @@ -17462,66 +16743,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%ecx), %ebp -; FALLBACK17-NEXT: movl 44(%ecx), %ebx -; FALLBACK17-NEXT: movl 48(%ecx), %edi -; FALLBACK17-NEXT: movl 52(%ecx), %esi -; FALLBACK17-NEXT: movl 56(%ecx), %edx -; FALLBACK17-NEXT: movl 60(%ecx), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movups (%ecx), %xmm0 +; FALLBACK17-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK17-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK17-NEXT: 
movups 48(%ecx), %xmm3 +; FALLBACK17-NEXT: movl (%eax), %ecx +; FALLBACK17-NEXT: xorps %xmm4, %xmm4 +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ecx, %ebp ; FALLBACK17-NEXT: andl $60, %ebp ; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -17534,7 +16771,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: shldl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%eax), %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edi, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%eax), %edi @@ -17558,55 +16795,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%eax), %edx -; FALLBACK17-NEXT: movl 44(%eax), %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: movl 40(%eax), %edi +; FALLBACK17-NEXT: movl 44(%eax), %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 56(%eax), %edx +; FALLBACK17-NEXT: shldl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 56(%eax), %esi ; FALLBACK17-NEXT: movl 60(%eax), %edi -; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl (%eax), %ebx -; FALLBACK17-NEXT: movl 52(%eax), %esi -; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: shldl %cl, %esi, %edi +; FALLBACK17-NEXT: movl 52(%eax), %edx +; FALLBACK17-NEXT: shldl %cl, %edx, %esi +; FALLBACK17-NEXT: movl (%eax), %eax +; FALLBACK17-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK17-NEXT: shll %cl, %eax ; FALLBACK17-NEXT: negl %ebp -; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl %edi, 
60(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %ebx, %edx -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: shldl %cl, %eax, %esi +; FALLBACK17-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK17-NEXT: shldl %cl, %ebp, %edx ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %edi, %eax -; FALLBACK17-NEXT: movl %eax, 48(%ebp) -; FALLBACK17-NEXT: movl %esi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK17-NEXT: shldl %cl, %ebx, %ebp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %esi, 56(%ecx) +; FALLBACK17-NEXT: movl %edi, 60(%ecx) +; FALLBACK17-NEXT: movl %ebp, 48(%ecx) +; FALLBACK17-NEXT: movl %edx, 52(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 40(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 44(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 32(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 36(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 24(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 28(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 16(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 20(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 8(%ecx) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 12(%ecx) +; FALLBACK17-NEXT: movl %eax, (%ecx) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl %edx, 4(%ebp) +; FALLBACK17-NEXT: movl %eax, 4(%ecx) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -17622,71 +16859,27 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebx -; FALLBACK18-NEXT: movl 44(%eax), %edi -; FALLBACK18-NEXT: movl 48(%eax), %esi -; FALLBACK18-NEXT: movl 52(%eax), %edx -; FALLBACK18-NEXT: movl 56(%eax), %ecx -; FALLBACK18-NEXT: movl 60(%eax), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK18-NEXT: movl (%ebp), %ebp -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK18-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK18-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: xorps %xmm4, %xmm4 +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx +; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: andl $60, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi +; FALLBACK18-NEXT: subl %eax, %edi ; FALLBACK18-NEXT: movl (%edi), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%edi), %eax @@ -17832,81 +17025,37 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $204, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl (%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%ebp), %ebx -; FALLBACK19-NEXT: movl 44(%ebp), %edi -; FALLBACK19-NEXT: movl 48(%ebp), %esi -; FALLBACK19-NEXT: movl 52(%ebp), %edx -; FALLBACK19-NEXT: movl 56(%ebp), %ecx -; FALLBACK19-NEXT: movl 60(%ebp), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl (%ebp), %ebp -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: subl $188, %esp 
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movups (%ecx), %xmm0 +; FALLBACK19-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK19-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK19-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK19-NEXT: movl (%eax), %ebp +; FALLBACK19-NEXT: xorps %xmm4, %xmm4 +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: leal (,%ebp,8), %ecx ; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: subl %ebp, %eax -; FALLBACK19-NEXT: movl 4(%eax), %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%eax), %edi +; FALLBACK19-NEXT: movl 8(%eax), %esi ; FALLBACK19-NEXT: movl 12(%eax), %edx -; FALLBACK19-NEXT: movl %edx, %ebx -; FALLBACK19-NEXT: shldl %cl, %edi, %ebx -; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl %edx, %edi ; FALLBACK19-NEXT: shldl %cl, %esi, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%eax), %edi +; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %edi, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 16(%eax), %edi ; FALLBACK19-NEXT: movl 20(%eax), %esi ; FALLBACK19-NEXT: movl %esi, %ebx @@ -17934,52 +17083,48 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: shldl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %esi, %ebx -; FALLBACK19-NEXT: movl 56(%eax), %edx +; FALLBACK19-NEXT: movl 56(%eax), %esi ; FALLBACK19-NEXT: movl 60(%eax), %edi -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: movl (%eax), %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 52(%eax), %esi -; FALLBACK19-NEXT: shldl %cl, %esi, %edx +; FALLBACK19-NEXT: shldl %cl, %esi, %edi +; FALLBACK19-NEXT: movl 52(%eax), %edx +; FALLBACK19-NEXT: shldl %cl, %edx, %esi +; FALLBACK19-NEXT: movl (%eax), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill ; FALLBACK19-NEXT: negl %ebp -; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: 
movl %edx, 56(%eax) -; FALLBACK19-NEXT: movl %edi, 60(%eax) +; FALLBACK19-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK19-NEXT: shldl %cl, %ebp, %edx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: shldl %cl, %eax, %ebp +; FALLBACK19-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl %esi, 56(%ecx) +; FALLBACK19-NEXT: movl %edi, 60(%ecx) +; FALLBACK19-NEXT: movl %ebp, 48(%ecx) +; FALLBACK19-NEXT: movl %edx, 52(%ecx) +; FALLBACK19-NEXT: movl %ebx, 40(%ecx) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %edi -; FALLBACK19-NEXT: shldl %cl, %ebp, %esi -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl %edx, 44(%ecx) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %edx, %ebp -; FALLBACK19-NEXT: movl %ebp, 48(%eax) -; FALLBACK19-NEXT: movl %esi, 52(%eax) -; FALLBACK19-NEXT: movl %ebx, 40(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 44(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 32(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 36(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 24(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 28(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 16(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 20(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 8(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, 12(%eax) -; FALLBACK19-NEXT: movl %edi, 4(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK19-NEXT: movl %ecx, (%eax) -; FALLBACK19-NEXT: addl $204, %esp +; FALLBACK19-NEXT: movl %edx, 32(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 36(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 24(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 28(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 16(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 20(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 8(%ecx) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 12(%ecx) +; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 4(%ecx) +; FALLBACK19-NEXT: movl %eax, (%ecx) +; FALLBACK19-NEXT: addl $188, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx @@ -17992,7 
+17137,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: subl $220, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 @@ -18009,172 +17154,189 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: andl $60, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: subl %edx, %ecx -; FALLBACK20-NEXT: movl (%ecx), %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 4(%ecx), %edx -; FALLBACK20-NEXT: movl %ecx, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: andl $60, %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK20-NEXT: subl %ecx, %edx +; FALLBACK20-NEXT: movl (%edx), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 4(%edx), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %esi, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 12(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 8(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, %edi -; FALLBACK20-NEXT: movl %esi, %ebp +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: notb %cl +; FALLBACK20-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 12(%edx), %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl 8(%edx), %edx +; FALLBACK20-NEXT: movl %edx, %ebp ; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx +; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl 20(%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: movl 20(%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 16(%edi), %esi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 16(%ebx), %edx +; FALLBACK20-NEXT: movl %edx, %esi +; FALLBACK20-NEXT: shrl %esi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, %edi ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: shrl %esi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %esi, %edi +; FALLBACK20-NEXT: movb %ch, %dl +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %ebp, %edx -; FALLBACK20-NEXT: movl 28(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 24(%ebp), %esi -; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: movl %ebx, %esi +; FALLBACK20-NEXT: movl 28(%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl %eax, %ebx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 24(%esi), %eax +; FALLBACK20-NEXT: movl %eax, %edi ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %ebx, %ecx +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: movl 36(%esi), %ebp ; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 36(%edx), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 32(%edx), %esi -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: movl %ebx, %eax +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 32(%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%ebp), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 40(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: movl 44(%edi), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%ebp), %esi -; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl 40(%edi), %edi +; FALLBACK20-NEXT: movl %edi, %ebx +; FALLBACK20-NEXT: shrl %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload 
-; FALLBACK20-NEXT: negl %edx -; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: orl %esi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl 52(%ecx), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: negl %ecx +; FALLBACK20-NEXT: movl 192(%esp,%ecx), %edx +; FALLBACK20-NEXT: movl %edx, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl %ebp, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%edi), %edx +; FALLBACK20-NEXT: movl 60(%edi), %ebp ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: movl 56(%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 56(%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %edi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: shrl %ebx ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %edx, (%eax) -; FALLBACK20-NEXT: movl %esi, 56(%eax) +; FALLBACK20-NEXT: movl %ebx, 56(%eax) ; FALLBACK20-NEXT: movl %edi, 60(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; 
FALLBACK20-NEXT: movl %esi, 52(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 40(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -18197,7 +17359,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: addl $220, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx @@ -18239,7 +17401,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 4(%eax), %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 16(%eax), %edi @@ -18263,55 +17425,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%eax), %edx -; FALLBACK21-NEXT: movl 44(%eax), %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 40(%eax), %edi +; FALLBACK21-NEXT: movl 44(%eax), %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%eax), %edx +; FALLBACK21-NEXT: movl 56(%eax), %esi ; FALLBACK21-NEXT: movl 60(%eax), %edi -; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%eax), %ebx -; FALLBACK21-NEXT: movl 52(%eax), %esi -; FALLBACK21-NEXT: shldl %cl, %esi, %edx +; FALLBACK21-NEXT: shldl %cl, %esi, %edi +; FALLBACK21-NEXT: movl 52(%eax), %edx +; FALLBACK21-NEXT: shldl %cl, %edx, %esi +; FALLBACK21-NEXT: movl (%eax), %eax +; FALLBACK21-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK21-NEXT: shll %cl, %eax ; FALLBACK21-NEXT: negl %ebp -; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl %edi, 60(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %ebx, %edx -; FALLBACK21-NEXT: shll %cl, %ebx -; FALLBACK21-NEXT: shldl %cl, %eax, %esi +; FALLBACK21-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK21-NEXT: shldl %cl, %ebp, %edx ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %edi, %eax -; FALLBACK21-NEXT: movl %eax, 48(%ebp) -; FALLBACK21-NEXT: movl %esi, 52(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK21-NEXT: shldl %cl, %ebx, %ebp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
FALLBACK21-NEXT: movl %esi, 56(%ecx) +; FALLBACK21-NEXT: movl %edi, 60(%ecx) +; FALLBACK21-NEXT: movl %ebp, 48(%ecx) +; FALLBACK21-NEXT: movl %edx, 52(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 40(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 44(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 32(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 36(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 24(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 28(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 16(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 20(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 8(%ecx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edx, 12(%ecx) +; FALLBACK21-NEXT: movl %eax, (%ecx) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl %edx, 4(%ebp) +; FALLBACK21-NEXT: movl %eax, 4(%ecx) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -18493,7 +17655,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $204, %esp +; FALLBACK23-NEXT: subl $188, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 @@ -18515,15 +17677,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: subl %ebp, %eax -; FALLBACK23-NEXT: movl 4(%eax), %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 8(%eax), %edi +; FALLBACK23-NEXT: movl 8(%eax), %esi ; FALLBACK23-NEXT: movl 12(%eax), %edx -; FALLBACK23-NEXT: movl %edx, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; 
FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl %edx, %edi ; FALLBACK23-NEXT: shldl %cl, %esi, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 4(%eax), %edi +; FALLBACK23-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %edi, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 16(%eax), %edi ; FALLBACK23-NEXT: movl 20(%eax), %esi ; FALLBACK23-NEXT: movl %esi, %ebx @@ -18551,52 +17713,48 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: shldl %cl, %ebx, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %ebx -; FALLBACK23-NEXT: movl 56(%eax), %edx +; FALLBACK23-NEXT: movl 56(%eax), %esi ; FALLBACK23-NEXT: movl 60(%eax), %edi -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: movl (%eax), %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 52(%eax), %esi -; FALLBACK23-NEXT: shldl %cl, %esi, %edx +; FALLBACK23-NEXT: shldl %cl, %esi, %edi +; FALLBACK23-NEXT: movl 52(%eax), %edx +; FALLBACK23-NEXT: shldl %cl, %edx, %esi +; FALLBACK23-NEXT: movl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill ; FALLBACK23-NEXT: negl %ebp -; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %edx, 56(%eax) -; FALLBACK23-NEXT: movl %edi, 60(%eax) +; FALLBACK23-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK23-NEXT: shldl %cl, %ebp, %edx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: shldl %cl, %eax, %ebp +; FALLBACK23-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movl %esi, 56(%ecx) +; FALLBACK23-NEXT: movl %edi, 60(%ecx) +; FALLBACK23-NEXT: movl %ebp, 48(%ecx) +; FALLBACK23-NEXT: movl %edx, 52(%ecx) +; FALLBACK23-NEXT: movl %ebx, 40(%ecx) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: shldl %cl, %ebp, %esi -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl %edx, 44(%ecx) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %edx, %ebp -; FALLBACK23-NEXT: movl %ebp, 48(%eax) -; FALLBACK23-NEXT: movl %esi, 52(%eax) -; FALLBACK23-NEXT: movl %ebx, 40(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 44(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 32(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 36(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 24(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 28(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 16(%eax) -; FALLBACK23-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 12(%eax) -; FALLBACK23-NEXT: movl %edi, 4(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, (%eax) -; FALLBACK23-NEXT: addl $204, %esp +; FALLBACK23-NEXT: movl %edx, 32(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 36(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 24(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 28(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 16(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 20(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 8(%ecx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 12(%ecx) +; FALLBACK23-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK23-NEXT: movl %edx, 4(%ecx) +; FALLBACK23-NEXT: movl %eax, (%ecx) +; FALLBACK23-NEXT: addl $188, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx @@ -18609,7 +17767,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: subl $220, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 @@ -18620,172 +17778,189 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: andl $60, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: subl %edx, %ecx -; FALLBACK24-NEXT: movl (%ecx), %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 4(%ecx), %edx -; FALLBACK24-NEXT: movl %ecx, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: andl $60, %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK24-NEXT: subl %ecx, %edx +; FALLBACK24-NEXT: movl (%edx), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 4(%edx), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: shll $3, %eax ; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %al, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %esi, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 
12(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 8(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, %edi -; FALLBACK24-NEXT: movl %esi, %ebp +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: notb %cl +; FALLBACK24-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 12(%edx), %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movl 8(%edx), %edx +; FALLBACK24-NEXT: movl %edx, %ebp ; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx +; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl 20(%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: movl 20(%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 16(%edi), %esi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 16(%ebx), %edx +; FALLBACK24-NEXT: movl %edx, %esi +; FALLBACK24-NEXT: shrl %esi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, %edi ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: shrl %esi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %esi, %edi +; FALLBACK24-NEXT: movb %ch, %dl +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, %edi ; FALLBACK24-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %ebp, %edx -; FALLBACK24-NEXT: movl 28(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 24(%ebp), %esi -; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: movl %ebx, %esi +; FALLBACK24-NEXT: movl 28(%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl %eax, %ebx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 24(%esi), %eax +; FALLBACK24-NEXT: movl %eax, %edi ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %ebx, %ecx +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: movl 36(%esi), %ebp ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 36(%edx), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 32(%edx), %esi -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: movl %ebx, %eax +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 32(%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%ebp), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 40(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: movl 44(%edi), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%ebp), %esi -; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl 40(%edi), %edi +; FALLBACK24-NEXT: movl %edi, %ebx +; FALLBACK24-NEXT: shrl %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: negl %edx -; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: orl %esi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl 52(%ecx), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: negl %ecx +; FALLBACK24-NEXT: movl 192(%esp,%ecx), %edx +; FALLBACK24-NEXT: movl %edx, %esi +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl %ebp, %esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%edi), 
%edx +; FALLBACK24-NEXT: movl 60(%edi), %ebp ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: movl 56(%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 56(%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %edi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: shrl %ebx ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %edx, (%eax) -; FALLBACK24-NEXT: movl %esi, 56(%eax) +; FALLBACK24-NEXT: movl %ebx, 56(%eax) ; FALLBACK24-NEXT: movl %edi, 60(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl %esi, 52(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -18808,7 +17983,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: addl $220, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx @@ -18845,7 +18020,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 4(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 16(%eax), %edi @@ -18869,55 +18044,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%eax), %edx -; FALLBACK25-NEXT: movl 44(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 40(%eax), %edi +; FALLBACK25-NEXT: movl 44(%eax), %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%eax), %edx +; FALLBACK25-NEXT: movl 56(%eax), %esi ; FALLBACK25-NEXT: movl 60(%eax), %edi -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%eax), %ebx -; FALLBACK25-NEXT: movl 52(%eax), %esi -; FALLBACK25-NEXT: shldl %cl, %esi, %edx +; FALLBACK25-NEXT: shldl %cl, %esi, %edi +; FALLBACK25-NEXT: movl 52(%eax), %edx +; FALLBACK25-NEXT: shldl %cl, %edx, %esi +; FALLBACK25-NEXT: movl (%eax), %eax +; FALLBACK25-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK25-NEXT: shll %cl, %eax ; FALLBACK25-NEXT: negl %ebp -; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl %edi, 60(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %ebx, %edx -; FALLBACK25-NEXT: shll %cl, %ebx -; FALLBACK25-NEXT: shldl %cl, %eax, %esi +; FALLBACK25-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK25-NEXT: shldl %cl, %ebp, %edx ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %edi, %eax -; FALLBACK25-NEXT: movl %eax, 48(%ebp) -; FALLBACK25-NEXT: movl %esi, 52(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK25-NEXT: shldl %cl, %ebx, %ebp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %esi, 56(%ecx) +; FALLBACK25-NEXT: movl %edi, 60(%ecx) +; FALLBACK25-NEXT: movl %ebp, 48(%ecx) +; FALLBACK25-NEXT: movl %edx, 52(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 40(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 44(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 32(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 36(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 24(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 28(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 16(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 20(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 8(%ecx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edx, 12(%ecx) +; FALLBACK25-NEXT: movl %eax, (%ecx) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl %edx, 4(%ebp) +; FALLBACK25-NEXT: movl %eax, 4(%ecx) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -19095,104 +18270,100 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $204, %esp +; FALLBACK27-NEXT: subl $188, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK27-NEXT: movl (%eax), %ebx +; FALLBACK27-NEXT: movl (%eax), %ebp ; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: leal (,%ebx,8), %ecx +; FALLBACK27-NEXT: leal (,%ebp,8), %ecx ; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: andl $60, %ebx +; FALLBACK27-NEXT: andl $60, %ebp ; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: subl %ebx, %eax -; FALLBACK27-NEXT: movl 4(%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 8(%eax), %edi +; FALLBACK27-NEXT: subl %ebp, %eax +; FALLBACK27-NEXT: movl 8(%eax), %esi ; FALLBACK27-NEXT: movl 12(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl %edx, %edi ; FALLBACK27-NEXT: shldl %cl, %esi, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 4(%eax), %edi +; FALLBACK27-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %edi, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 16(%eax), %edi ; FALLBACK27-NEXT: movl 20(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl %esi, %ebx +; FALLBACK27-NEXT: shldl %cl, %edi, %ebx +; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 24(%eax), %edi ; FALLBACK27-NEXT: movl 28(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl %edx, %ebx +; FALLBACK27-NEXT: shldl %cl, %edi, %ebx +; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %esi, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 32(%eax), %edi ; FALLBACK27-NEXT: movl 36(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; FALLBACK27-NEXT: shldl %cl, 
%edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl %esi, %ebx +; FALLBACK27-NEXT: shldl %cl, %edi, %ebx +; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%eax), %ebp +; FALLBACK27-NEXT: movl 40(%eax), %ebx ; FALLBACK27-NEXT: movl 44(%eax), %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %ebp, %edx +; FALLBACK27-NEXT: shldl %cl, %ebx, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl 56(%eax), %edx +; FALLBACK27-NEXT: shldl %cl, %esi, %ebx +; FALLBACK27-NEXT: movl 56(%eax), %esi ; FALLBACK27-NEXT: movl 60(%eax), %edi -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%eax), %esi -; FALLBACK27-NEXT: shldl %cl, %esi, %edx -; FALLBACK27-NEXT: negl %ebx -; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %edx, 56(%eax) -; FALLBACK27-NEXT: movl %edi, 60(%eax) +; FALLBACK27-NEXT: shldl %cl, %esi, %edi +; FALLBACK27-NEXT: movl 52(%eax), %edx +; FALLBACK27-NEXT: shldl %cl, %edx, %esi +; FALLBACK27-NEXT: movl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK27-NEXT: negl %ebp +; FALLBACK27-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK27-NEXT: shldl %cl, %ebp, %edx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: shldl %cl, %eax, %ebp +; FALLBACK27-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: movl %esi, 56(%ecx) +; FALLBACK27-NEXT: movl %edi, 60(%ecx) +; FALLBACK27-NEXT: movl %ebp, 48(%ecx) +; FALLBACK27-NEXT: movl %edx, 52(%ecx) +; FALLBACK27-NEXT: movl %ebx, 40(%ecx) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: shldl %cl, %ebx, %esi -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl %edx, 44(%ecx) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl %ebx, 48(%eax) -; FALLBACK27-NEXT: movl %esi, 52(%eax) -; FALLBACK27-NEXT: movl %ebp, 40(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 44(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 32(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 36(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 24(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 28(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%eax) -; FALLBACK27-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%eax) -; FALLBACK27-NEXT: movl %edi, 4(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, (%eax) -; FALLBACK27-NEXT: addl $204, %esp +; FALLBACK27-NEXT: movl %edx, 32(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 36(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 24(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 28(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 16(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 20(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 8(%ecx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 12(%ecx) +; FALLBACK27-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK27-NEXT: movl %edx, 4(%ecx) +; FALLBACK27-NEXT: movl %eax, (%ecx) +; FALLBACK27-NEXT: addl $188, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx @@ -19206,7 +18377,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: subl $220, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 @@ -19214,172 +18385,189 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: andl $60, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: subl %edx, %ecx -; FALLBACK28-NEXT: movl (%ecx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 4(%ecx), %edx -; FALLBACK28-NEXT: movl %ecx, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: andl $60, %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK28-NEXT: subl %ecx, %edx +; FALLBACK28-NEXT: movl (%edx), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 4(%edx), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: shll $3, %eax ; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %al, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 
12(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 8(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movl %esi, %ebp +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: notb %cl +; FALLBACK28-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 12(%edx), %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl 8(%edx), %edx +; FALLBACK28-NEXT: movl %edx, %ebp ; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx +; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl 20(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: movl 20(%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 16(%edi), %esi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 16(%ebx), %edx +; FALLBACK28-NEXT: movl %edx, %esi +; FALLBACK28-NEXT: shrl %esi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, %edi ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: shrl %esi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi +; FALLBACK28-NEXT: movb %ch, %dl +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, %edi ; FALLBACK28-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %ebp, %edx -; FALLBACK28-NEXT: movl 28(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 24(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: movl %ebx, %esi +; FALLBACK28-NEXT: movl 28(%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl %eax, %ebx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 24(%esi), %eax +; FALLBACK28-NEXT: movl %eax, %edi ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %ebx, %ecx +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: movl 36(%esi), %ebp ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%edx), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 32(%edx), %esi -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: movl %ebx, %eax +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 32(%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 40(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: movl 44(%edi), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl 40(%edi), %edi +; FALLBACK28-NEXT: movl %edi, %ebx +; FALLBACK28-NEXT: shrl %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: negl %edx -; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: orl %esi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl 52(%ecx), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: negl %ecx +; FALLBACK28-NEXT: movl 192(%esp,%ecx), %edx +; FALLBACK28-NEXT: movl %edx, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl %ebp, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%edi), 
%edx +; FALLBACK28-NEXT: movl 60(%edi), %ebp ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl 56(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 56(%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %edi ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: shrl %ebx ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %esi, 56(%eax) +; FALLBACK28-NEXT: movl %ebx, 56(%eax) ; FALLBACK28-NEXT: movl %edi, 60(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl %esi, 52(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -19402,7 +18590,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: addl $220, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx @@ -19436,7 +18624,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: shldl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 4(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 16(%eax), %edi @@ -19460,55 +18648,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%eax), %edx -; FALLBACK29-NEXT: movl 44(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 40(%eax), %edi +; FALLBACK29-NEXT: movl 44(%eax), %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%eax), %edx +; FALLBACK29-NEXT: movl 56(%eax), %esi ; FALLBACK29-NEXT: movl 60(%eax), %edi -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%eax), %ebx -; FALLBACK29-NEXT: movl 52(%eax), %esi -; FALLBACK29-NEXT: shldl %cl, %esi, %edx +; FALLBACK29-NEXT: shldl %cl, %esi, %edi +; FALLBACK29-NEXT: movl 52(%eax), %edx +; FALLBACK29-NEXT: shldl %cl, %edx, %esi +; FALLBACK29-NEXT: movl (%eax), %eax +; FALLBACK29-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK29-NEXT: shll %cl, %eax ; FALLBACK29-NEXT: negl %ebp -; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl %edi, 60(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %ebx, %edx -; FALLBACK29-NEXT: shll %cl, %ebx -; FALLBACK29-NEXT: shldl %cl, %eax, %esi +; FALLBACK29-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK29-NEXT: shldl %cl, %ebp, %edx ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %edi, %eax -; FALLBACK29-NEXT: movl %eax, 48(%ebp) -; FALLBACK29-NEXT: movl %esi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl %edx, 4(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK29-NEXT: shldl %cl, %ebx, %ebp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl %esi, 56(%ecx) +; FALLBACK29-NEXT: movl %edi, 60(%ecx) +; FALLBACK29-NEXT: movl %ebp, 48(%ecx) +; FALLBACK29-NEXT: movl %edx, 52(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 40(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 44(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 32(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 36(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 24(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 
28(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 16(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 20(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 8(%ecx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edx, 12(%ecx) +; FALLBACK29-NEXT: movl %eax, (%ecx) +; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 4(%ecx) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -19683,101 +18871,97 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $204, %esp +; FALLBACK31-NEXT: subl $188, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK31-NEXT: movl (%eax), %ebx +; FALLBACK31-NEXT: movl (%eax), %ebp ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: leal (,%ebx,8), %ecx +; FALLBACK31-NEXT: leal (,%ebp,8), %ecx ; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: andl $60, %ebx +; FALLBACK31-NEXT: andl $60, %ebp ; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: subl %ebx, %eax -; FALLBACK31-NEXT: movl 4(%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 8(%eax), %edi +; FALLBACK31-NEXT: subl %ebp, %eax +; FALLBACK31-NEXT: movl 8(%eax), %esi ; FALLBACK31-NEXT: movl 12(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl %edx, %edi ; FALLBACK31-NEXT: shldl %cl, %esi, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 4(%eax), %edi +; FALLBACK31-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %edi, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 16(%eax), %edi ; FALLBACK31-NEXT: movl 20(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl %esi, %ebx +; FALLBACK31-NEXT: shldl %cl, %edi, %ebx +; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 24(%eax), %edi ; FALLBACK31-NEXT: movl 28(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl %edx, %ebx +; FALLBACK31-NEXT: shldl %cl, %edi, %ebx +; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %esi, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 32(%eax), %edi ; FALLBACK31-NEXT: movl 36(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK31-NEXT: movl %esi, %ebx +; FALLBACK31-NEXT: shldl %cl, %edi, %ebx +; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%eax), %ebp +; FALLBACK31-NEXT: movl 40(%eax), %ebx ; FALLBACK31-NEXT: movl 44(%eax), %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %ebp, %edx +; FALLBACK31-NEXT: shldl %cl, %ebx, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl 56(%eax), %edx +; FALLBACK31-NEXT: shldl %cl, %esi, %ebx +; FALLBACK31-NEXT: movl 56(%eax), %esi ; FALLBACK31-NEXT: movl 60(%eax), %edi -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%eax), %esi -; FALLBACK31-NEXT: shldl %cl, %esi, %edx -; FALLBACK31-NEXT: negl %ebx -; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %edx, 56(%eax) -; FALLBACK31-NEXT: movl %edi, 60(%eax) +; FALLBACK31-NEXT: shldl %cl, %esi, %edi +; FALLBACK31-NEXT: movl 52(%eax), %edx +; FALLBACK31-NEXT: shldl %cl, %edx, %esi +; FALLBACK31-NEXT: movl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; FALLBACK31-NEXT: negl %ebp +; FALLBACK31-NEXT: movl 160(%esp,%ebp), %ebp +; FALLBACK31-NEXT: shldl %cl, %ebp, %edx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: shldl %cl, %eax, %ebp +; FALLBACK31-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: movl %esi, 56(%ecx) +; FALLBACK31-NEXT: movl %edi, 60(%ecx) +; FALLBACK31-NEXT: movl %ebp, 48(%ecx) +; FALLBACK31-NEXT: movl %edx, 52(%ecx) +; FALLBACK31-NEXT: movl %ebx, 40(%ecx) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: shldl %cl, %ebx, %esi -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl %edx, 44(%ecx) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl %ebx, 48(%eax) -; FALLBACK31-NEXT: movl %esi, 52(%eax) -; FALLBACK31-NEXT: movl %ebp, 40(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 44(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 32(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 36(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 24(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 28(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 
20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%eax) -; FALLBACK31-NEXT: movl %edi, 4(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, (%eax) -; FALLBACK31-NEXT: addl $204, %esp +; FALLBACK31-NEXT: movl %edx, 32(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 36(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 24(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 28(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 16(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 20(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 8(%ecx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 12(%ecx) +; FALLBACK31-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK31-NEXT: movl %edx, 4(%ecx) +; FALLBACK31-NEXT: movl %eax, (%ecx) +; FALLBACK31-NEXT: addl $188, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx @@ -19795,50 +18979,34 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: shl_64bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rbx -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %r9 -; X64-SSE2-NEXT: movq 32(%rdi), %r10 -; X64-SSE2-NEXT: movq 40(%rdi), %r11 -; X64-SSE2-NEXT: movq 48(%rdi), %rbx -; X64-SSE2-NEXT: movq 56(%rdi), %rdi -; X64-SSE2-NEXT: movl (%rsi), %esi -; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movups 32(%rdi), %xmm2 +; X64-SSE2-NEXT: movups 48(%rdi), %xmm3 +; X64-SSE2-NEXT: movl (%rsi), %eax +; X64-SSE2-NEXT: xorps %xmm4, %xmm4 +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: shll $3, %esi -; X64-SSE2-NEXT: andl $56, %esi -; X64-SSE2-NEXT: negl %esi -; X64-SSE2-NEXT: movslq %esi, %rax -; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx -; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi -; 
X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11 -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) -; X64-SSE2-NEXT: movq %rsi, 8(%rdx) -; X64-SSE2-NEXT: popq %rbx +; X64-SSE2-NEXT: shll $3, %eax +; X64-SSE2-NEXT: andl $56, %eax +; X64-SSE2-NEXT: negl %eax +; X64-SSE2-NEXT: cltq +; X64-SSE2-NEXT: movups -64(%rsp,%rax), %xmm0 +; X64-SSE2-NEXT: movups -48(%rsp,%rax), %xmm1 +; X64-SSE2-NEXT: movups -32(%rsp,%rax), %xmm2 +; X64-SSE2-NEXT: movups -16(%rsp,%rax), %xmm3 +; X64-SSE2-NEXT: movups %xmm3, 48(%rdx) +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) +; X64-SSE2-NEXT: popq %rax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: shl_64bytes_qwordOff: @@ -19926,134 +19094,38 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou ; ; X86-SSE2-LABEL: shl_64bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: pushl %ebx -; X86-SSE2-NEXT: pushl %edi -; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $188, %esp -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%ecx), %eax -; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 40(%ecx), %ebp -; X86-SSE2-NEXT: movl 44(%ecx), %ebx -; X86-SSE2-NEXT: movl 48(%ecx), %edi -; X86-SSE2-NEXT: movl 52(%ecx), %esi -; X86-SSE2-NEXT: movl 56(%ecx), %edx -; X86-SSE2-NEXT: movl 60(%ecx), %eax +; X86-SSE2-NEXT: subl $140, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: movups 32(%edx), %xmm2 +; X86-SSE2-NEXT: movups 48(%edx), %xmm3 ; X86-SSE2-NEXT: movl (%ecx), %ecx -; X86-SSE2-NEXT: xorps %xmm0, %xmm0 -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: xorps %xmm4, %xmm4 +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm4, (%esp) +; X86-SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shll $3, %ecx ; X86-SSE2-NEXT: andl $56, %ecx -; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: subl %ecx, %eax -; X86-SSE2-NEXT: movl (%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%eax), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%eax), %ebp -; X86-SSE2-NEXT: movl 40(%eax), %ebx -; X86-SSE2-NEXT: movl 52(%eax), %edi -; X86-SSE2-NEXT: movl 60(%eax), %esi -; X86-SSE2-NEXT: movl 56(%eax), %edx +; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: subl %ecx, %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: movups 32(%edx), %xmm2 ; X86-SSE2-NEXT: negl %ecx -; X86-SSE2-NEXT: movl 160(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 56(%eax) -; X86-SSE2-NEXT: movl %esi, 60(%eax) -; X86-SSE2-NEXT: movl %ecx, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl 
%ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 36(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 20(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $188, %esp -; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi -; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: movups 112(%esp,%ecx), %xmm3 +; X86-SSE2-NEXT: movups %xmm3, 48(%eax) +; X86-SSE2-NEXT: movups %xmm2, 32(%eax) +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $140, %esp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_64bytes_qwordOff: @@ -20161,110 +19233,110 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_64bytes: ; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbp ; FALLBACK0-NEXT: pushq %r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %r14 +; FALLBACK0-NEXT: subq $24, %rsp +; FALLBACK0-NEXT: movups (%rdi), %xmm0 +; FALLBACK0-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK0-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK0-NEXT: movq 48(%rdi), %rax +; FALLBACK0-NEXT: movq 56(%rdi), %rcx ; FALLBACK0-NEXT: movl (%rsi), %edi -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %r14 -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: sarq 
$63, %rcx +; FALLBACK0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, (%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rdi,8), %eax ; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %edi -; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 -; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %r15 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: shrq %cl, %r15 +; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %rcx +; FALLBACK0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %r8, %r8 +; FALLBACK0-NEXT: leaq (%rcx,%rcx), %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r10, %r8 -; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq %r10, %r15 +; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %r9 +; FALLBACK0-NEXT: movq %r9, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 +; FALLBACK0-NEXT: shrq %cl, %r12 +; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r14 ; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: orq %r15, %r11 +; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %rbx +; FALLBACK0-NEXT: movq %rbx, %r13 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: addq %r10, %r10 +; FALLBACK0-NEXT: shrq %cl, %r13 +; FALLBACK0-NEXT: addq %r9, %r9 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: movq %rbx, %r12 +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: orq %r15, %r8 +; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %r15 +; FALLBACK0-NEXT: movq %r15, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r12 -; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 -; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: orq %r12, %r11 +; FALLBACK0-NEXT: movq -64(%rsp,%rdi), %rbp +; FALLBACK0-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 +; FALLBACK0-NEXT: shlq %cl, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: addq %rbx, %rbx +; FALLBACK0-NEXT: addq %r15, %r15 ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: orq %r14, %rbx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbp +; FALLBACK0-NEXT: orq %r13, %r9 +; FALLBACK0-NEXT: orq %r10, %r12 +; FALLBACK0-NEXT: movq -56(%rsp,%rdi), %rdi +; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; 
FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 +; FALLBACK0-NEXT: orq %r14, %r15 +; FALLBACK0-NEXT: orq %rbp, %r10 +; FALLBACK0-NEXT: addq %rbx, %rbx ; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: orq %r13, %r14 +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: orq %r13, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %rdi ; FALLBACK0-NEXT: movq %rdi, 56(%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %rbx, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %rbx, 8(%rdx) +; FALLBACK0-NEXT: movq %r10, 48(%rdx) +; FALLBACK0-NEXT: movq %r15, 32(%rdx) +; FALLBACK0-NEXT: movq %r12, 40(%rdx) +; FALLBACK0-NEXT: movq %r9, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) ; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: addq $24, %rsp ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: popq %rbp ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_64bytes: @@ -20272,23 +19344,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: pushq %r15 ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: movq (%rdi), %rcx -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %r10 -; FALLBACK1-NEXT: movq 32(%rdi), %r11 -; FALLBACK1-NEXT: movq 40(%rdi), %rbx -; FALLBACK1-NEXT: movq 48(%rdi), %r14 +; FALLBACK1-NEXT: movups (%rdi), %xmm0 +; FALLBACK1-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK1-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK1-NEXT: movq 48(%rdi), %rcx ; FALLBACK1-NEXT: movq 56(%rdi), %rdi ; FALLBACK1-NEXT: movl (%rsi), %eax ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -20301,34 +19367,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: leal (,%rax,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax -; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK1-NEXT: movq %r9, %r8 -; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %rsi +; FALLBACK1-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK1-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK1-NEXT: movq %r10, %r8 +; FALLBACK1-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq %r11, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK1-NEXT: 
shrdq %cl, %r9, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK1-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r14 ; FALLBACK1-NEXT: movq %r14, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %r14, %rax ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %r11, 48(%rdx) -; FALLBACK1-NEXT: movq %rax, 56(%rdx) -; FALLBACK1-NEXT: movq %r10, 32(%rdx) -; FALLBACK1-NEXT: movq %r15, 40(%rdx) -; FALLBACK1-NEXT: movq %rdi, 16(%rdx) -; FALLBACK1-NEXT: movq %rbx, 24(%rdx) -; FALLBACK1-NEXT: movq %rsi, (%rdx) -; FALLBACK1-NEXT: movq %r8, 8(%rdx) +; FALLBACK1-NEXT: sarq %cl, %r11 +; FALLBACK1-NEXT: movq %r15, 8(%rdx) +; FALLBACK1-NEXT: movq %r9, 48(%rdx) +; FALLBACK1-NEXT: movq %r11, 56(%rdx) +; FALLBACK1-NEXT: movq %rdi, 32(%rdx) +; FALLBACK1-NEXT: movq %rbx, 40(%rdx) +; FALLBACK1-NEXT: movq %r8, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %rax, (%rdx) ; FALLBACK1-NEXT: popq %rbx ; FALLBACK1-NEXT: popq %r14 ; FALLBACK1-NEXT: popq %r15 @@ -20343,23 +19410,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %r10 -; FALLBACK2-NEXT: movq 32(%rdi), %r11 -; FALLBACK2-NEXT: movq 40(%rdi), %rbx -; FALLBACK2-NEXT: movq 48(%rdi), %r14 +; FALLBACK2-NEXT: movups (%rdi), %xmm0 +; FALLBACK2-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK2-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK2-NEXT: movq 48(%rdi), %rcx ; FALLBACK2-NEXT: movq 56(%rdi), %rdi ; FALLBACK2-NEXT: movl (%rsi), %eax ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -20369,55 +19430,55 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: leal (,%rax,8), %esi +; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: 
movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK2-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK2-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK2-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK2-NEXT: movl %esi, %ebx +; FALLBACK2-NEXT: notb %bl +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK2-NEXT: leaq (,%rbp,2), %r8 +; FALLBACK2-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK2-NEXT: orq %r11, %r8 +; FALLBACK2-NEXT: leaq (,%r13,2), %r11 +; FALLBACK2-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK2-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: sarxq %rsi, %rax, %rsi +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK2-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK2-NEXT: orq %r14, %r9 ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK2-NEXT: orq %r15, %r10 ; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax +; FALLBACK2-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) +; FALLBACK2-NEXT: addq %rcx, %rcx +; FALLBACK2-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK2-NEXT: orq %rbp, %rcx +; FALLBACK2-NEXT: movq %rsi, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 8(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r9, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 @@ -20432,23 +19493,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: pushq %r15 ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: 
movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 +; FALLBACK3-NEXT: movups (%rdi), %xmm0 +; FALLBACK3-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK3-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK3-NEXT: movq 48(%rdi), %rcx ; FALLBACK3-NEXT: movq 56(%rdi), %rdi ; FALLBACK3-NEXT: movl (%rsi), %eax ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -20461,34 +19516,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: leal (,%rax,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %rsi +; FALLBACK3-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK3-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK3-NEXT: movq %r10, %r8 +; FALLBACK3-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK3-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK3-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r14 ; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax +; FALLBACK3-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK3-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) +; FALLBACK3-NEXT: shrdq %cl, %r14, %rax +; FALLBACK3-NEXT: movq %r15, 8(%rdx) +; FALLBACK3-NEXT: movq %r9, 48(%rdx) +; FALLBACK3-NEXT: movq %rdi, 32(%rdx) +; FALLBACK3-NEXT: movq %rbx, 40(%rdx) +; FALLBACK3-NEXT: movq %r8, 16(%rdx) +; FALLBACK3-NEXT: movq %rsi, 24(%rdx) +; FALLBACK3-NEXT: movq %rax, (%rdx) +; FALLBACK3-NEXT: movq %r10, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 @@ -20502,7 +19558,7 @@ define void 
@ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: pushq %r13 ; FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax +; FALLBACK4-NEXT: subq $24, %rsp ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 @@ -20515,8 +19571,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: sarq $63, %rcx -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, (%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) @@ -20526,72 +19582,74 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: leal (,%rdi,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %edi -; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %r15 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: shrq %cl, %r15 +; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %rcx +; FALLBACK4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK4-NEXT: leaq (%rcx,%rcx), %r8 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 -; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx +; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r9 +; FALLBACK4-NEXT: movq %r9, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK4-NEXT: shrq %cl, %r12 +; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %r14 +; FALLBACK4-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 +; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %rbx +; FALLBACK4-NEXT: movq %rbx, %r13 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: addq %r9, %r9 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK4-NEXT: movq %r14, %r13 +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: orq %r15, %r8 +; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %r15 +; FALLBACK4-NEXT: movq %r15, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: orq %r12, %r11 +; FALLBACK4-NEXT: movq -64(%rsp,%rdi), %rbp +; FALLBACK4-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 +; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 +; FALLBACK4-NEXT: shrq %cl, %r14 +; FALLBACK4-NEXT: addq %r15, %r15 ; FALLBACK4-NEXT: movl %esi, %ecx -; 
FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 +; FALLBACK4-NEXT: shlq %cl, %r15 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK4-NEXT: orq %r13, %r9 +; FALLBACK4-NEXT: orq %r10, %r12 +; FALLBACK4-NEXT: movq -56(%rsp,%rdi), %rdi +; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r10 ; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 +; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: orq %r14, %r15 +; FALLBACK4-NEXT: orq %rbp, %r10 ; FALLBACK4-NEXT: addq %rbx, %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx +; FALLBACK4-NEXT: orq %r13, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %rdi ; FALLBACK4-NEXT: movq %rdi, 56(%rdx) ; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) +; FALLBACK4-NEXT: movq %r10, 48(%rdx) +; FALLBACK4-NEXT: movq %r15, 32(%rdx) +; FALLBACK4-NEXT: movq %r12, 40(%rdx) +; FALLBACK4-NEXT: movq %r9, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) ; FALLBACK4-NEXT: movq %r8, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp +; FALLBACK4-NEXT: addq $24, %rsp ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 @@ -20642,11 +19700,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 +; FALLBACK5-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK5-NEXT: movq %r14, %r15 ; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK5-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK5-NEXT: shrdq %cl, %r14, %rax ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: sarq %cl, %r11 ; FALLBACK5-NEXT: movq %r15, 8(%rdx) @@ -20656,7 +19714,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq %rbx, 40(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) +; FALLBACK5-NEXT: movq %rax, (%rdx) ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 ; FALLBACK5-NEXT: popq %r15 @@ -20706,10 +19764,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movl %esi, %ebx ; FALLBACK6-NEXT: notb %bl ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK6-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK6-NEXT: leaq (,%r13,2), %r11 ; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK6-NEXT: orq %r12, %r11 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 @@ -20791,20 +19849,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK7-NEXT: movq 
-128(%rsp,%rax), %r14 -; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r15 +; FALLBACK7-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK7-NEXT: movq %r14, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK7-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: shrdq %cl, %r14, %rax ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) -; FALLBACK7-NEXT: movq %r14, (%rdx) +; FALLBACK7-NEXT: movq %rax, (%rdx) ; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 @@ -20819,7 +19877,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: pushq %rax +; FALLBACK8-NEXT: subq $24, %rsp ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK8-NEXT: movq 48(%rdi), %rax @@ -20830,8 +19888,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: sarq $63, %rcx -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, (%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) @@ -20841,72 +19899,74 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: leal (,%rdi,8), %eax ; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %edi -; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %r15 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: shrq %cl, %r15 +; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %rcx +; FALLBACK8-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK8-NEXT: leaq (%rcx,%rcx), %r8 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 -; FALLBACK8-NEXT: orq %r10, %r8 -; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK8-NEXT: movq %r10, %rbx +; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r9 +; FALLBACK8-NEXT: movq %r9, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK8-NEXT: shrq %cl, %r12 +; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %r14 +; FALLBACK8-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK8-NEXT: movq %rbx, %r14 +; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %rbx +; FALLBACK8-NEXT: movq %rbx, %r13 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r14 -; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: addq %r9, %r9 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r14, %r10 -; 
FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK8-NEXT: movq %r14, %r13 +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: orq %r15, %r8 +; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %r15 +; FALLBACK8-NEXT: movq %r15, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r13 -; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: orq %r12, %r11 +; FALLBACK8-NEXT: movq -64(%rsp,%rdi), %rbp +; FALLBACK8-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: orq %r13, %r15 +; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: addq %r14, %r14 +; FALLBACK8-NEXT: shrq %cl, %r14 +; FALLBACK8-NEXT: addq %r15, %r15 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: orq %r12, %r14 +; FALLBACK8-NEXT: shlq %cl, %r15 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbp -; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK8-NEXT: orq %r13, %r9 +; FALLBACK8-NEXT: orq %r10, %r12 +; FALLBACK8-NEXT: movq -56(%rsp,%rdi), %rdi +; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r10 ; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: orq %rbp, %r12 +; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 +; FALLBACK8-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: orq %r14, %r15 +; FALLBACK8-NEXT: orq %rbp, %r10 ; FALLBACK8-NEXT: addq %rbx, %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r9, %rbx +; FALLBACK8-NEXT: orq %r13, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %rdi ; FALLBACK8-NEXT: movq %rdi, 56(%rdx) ; FALLBACK8-NEXT: movq %rbx, 8(%rdx) -; FALLBACK8-NEXT: movq %r12, 48(%rdx) -; FALLBACK8-NEXT: movq %r14, 32(%rdx) -; FALLBACK8-NEXT: movq %r15, 40(%rdx) -; FALLBACK8-NEXT: movq %r10, 16(%rdx) +; FALLBACK8-NEXT: movq %r10, 48(%rdx) +; FALLBACK8-NEXT: movq %r15, 32(%rdx) +; FALLBACK8-NEXT: movq %r12, 40(%rdx) +; FALLBACK8-NEXT: movq %r9, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) ; FALLBACK8-NEXT: movq %r8, (%rdx) -; FALLBACK8-NEXT: addq $8, %rsp +; FALLBACK8-NEXT: addq $24, %rsp ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; FALLBACK8-NEXT: popq %r13 @@ -20956,11 +20016,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK9-NEXT: movq %r14, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK9-NEXT: shrdq %cl, %r14, %rax ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: sarq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) @@ -20970,7 +20030,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r14, (%rdx) +; FALLBACK9-NEXT: movq %rax, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; 
FALLBACK9-NEXT: popq %r14 ; FALLBACK9-NEXT: popq %r15 @@ -21019,10 +20079,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movl %esi, %ebx ; FALLBACK10-NEXT: notb %bl ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK10-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK10-NEXT: leaq (,%r13,2), %r11 ; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK10-NEXT: orq %r12, %r11 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 @@ -21103,20 +20163,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 +; FALLBACK11-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK11-NEXT: movq %r14, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK11-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: shrdq %cl, %r14, %rax ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) ; FALLBACK11-NEXT: movq %rdi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rsi, 24(%rdx) -; FALLBACK11-NEXT: movq %r14, (%rdx) +; FALLBACK11-NEXT: movq %rax, (%rdx) ; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 @@ -21132,7 +20192,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: pushq %rax +; FALLBACK12-NEXT: subq $24, %rsp ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK12-NEXT: movq 48(%rdi), %rax @@ -21143,8 +20203,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: sarq $63, %rcx -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, (%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) @@ -21154,72 +20214,74 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: leal (,%rdi,8), %eax ; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %edi -; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %r15 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r15 +; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %rcx +; FALLBACK12-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK12-NEXT: leaq (%rcx,%rcx), %r8 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 -; FALLBACK12-NEXT: orq %r10, %r8 -; FALLBACK12-NEXT: movq 
-104(%rsp,%rdi), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r9 +; FALLBACK12-NEXT: movq %r9, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK12-NEXT: shrq %cl, %r12 +; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %r14 +; FALLBACK12-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK12-NEXT: movq %rbx, %r14 +; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %rbx +; FALLBACK12-NEXT: movq %rbx, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK12-NEXT: movq %r14, %r13 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r15, %r8 +; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %r15 +; FALLBACK12-NEXT: movq %r15, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: orq %r12, %r11 +; FALLBACK12-NEXT: movq -64(%rsp,%rdi), %rbp +; FALLBACK12-NEXT: leaq (,%rbp,2), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: orq %r13, %r15 +; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: addq %r14, %r14 +; FALLBACK12-NEXT: shrq %cl, %r14 +; FALLBACK12-NEXT: addq %r15, %r15 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: orq %r12, %r14 +; FALLBACK12-NEXT: shlq %cl, %r15 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK12-NEXT: orq %r13, %r9 +; FALLBACK12-NEXT: orq %r10, %r12 +; FALLBACK12-NEXT: movq -56(%rsp,%rdi), %rdi +; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r10 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: orq %rbp, %r12 +; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: orq %r14, %r15 +; FALLBACK12-NEXT: orq %rbp, %r10 ; FALLBACK12-NEXT: addq %rbx, %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r9, %rbx +; FALLBACK12-NEXT: orq %r13, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %rdi ; FALLBACK12-NEXT: movq %rdi, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) -; FALLBACK12-NEXT: movq %r12, 48(%rdx) -; FALLBACK12-NEXT: movq %r14, 32(%rdx) -; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r10, 48(%rdx) +; FALLBACK12-NEXT: movq %r15, 32(%rdx) +; FALLBACK12-NEXT: movq %r12, 40(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %r8, (%rdx) -; FALLBACK12-NEXT: addq $8, %rsp +; FALLBACK12-NEXT: addq $24, %rsp ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 @@ 
-21269,11 +20331,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r15 +; FALLBACK13-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK13-NEXT: movq %r14, %r15 ; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK13-NEXT: movq -128(%rsp,%rax), %rax +; FALLBACK13-NEXT: shrdq %cl, %r14, %rax ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: sarq %cl, %r11 ; FALLBACK13-NEXT: movq %r15, 8(%rdx) @@ -21283,7 +20345,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: movq %rbx, 40(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r14, (%rdx) +; FALLBACK13-NEXT: movq %rax, (%rdx) ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 ; FALLBACK13-NEXT: popq %r15 @@ -21332,10 +20394,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movl %esi, %ebx ; FALLBACK14-NEXT: notb %bl ; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK14-NEXT: leaq (,%rbp,2), %r8 ; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK14-NEXT: leaq (,%r13,2), %r11 ; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK14-NEXT: orq %r12, %r11 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 @@ -21416,20 +20478,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -120(%rsp,%rax), %r14 +; FALLBACK15-NEXT: movq %r14, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rax), %rax ; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %r14, %rax ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) ; FALLBACK15-NEXT: movq %rdi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rsi, 24(%rdx) -; FALLBACK15-NEXT: movq %r14, (%rdx) +; FALLBACK15-NEXT: movq %rax, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 @@ -21443,62 +20505,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp +; FALLBACK16-NEXT: subl $236, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ecx), %eax -; 
FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%ecx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%ecx), %ebx -; FALLBACK16-NEXT: movl 44(%ecx), %edi -; FALLBACK16-NEXT: movl 48(%ecx), %esi -; FALLBACK16-NEXT: movl 52(%ecx), %edx -; FALLBACK16-NEXT: movl 56(%ecx), %eax +; FALLBACK16-NEXT: movups (%ecx), %xmm0 +; FALLBACK16-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK16-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK16-NEXT: movl 48(%ecx), %edx +; FALLBACK16-NEXT: movl 52(%ecx), %esi +; FALLBACK16-NEXT: movl 56(%ecx), %edi ; FALLBACK16-NEXT: movl 60(%ecx), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK16-NEXT: movl (%ebp), %ebp -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl (%eax), %eax ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: sarl $31, %ecx ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -21516,158 +20540,176 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, %ecx -; 
FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %ecx -; FALLBACK16-NEXT: andl $24, %ecx -; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: andl $60, %ecx +; FALLBACK16-NEXT: movl 100(%esp,%ecx), %edx ; FALLBACK16-NEXT: movl %ecx, %ebx -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: shll $3, %eax +; FALLBACK16-NEXT: andl $24, %eax +; FALLBACK16-NEXT: movl %edx, %esi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK16-NEXT: leal (,%ebp,2), %esi +; FALLBACK16-NEXT: movb %al, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 96(%esp,%ebx), %esi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %eax, %edi +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movl %edi, %edx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%eax,%eax), %edi +; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %eax, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; 
FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 128(%esp,%ebx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp +; FALLBACK16-NEXT: addl %eax, %eax ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl %edx, %ebx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: movl 132(%esp,%esi), %edx ; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %edx, %eax +; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, 
%esi -; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %eax, %eax ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %esi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax +; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebx, %edx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi -; FALLBACK16-NEXT: movl %edi, %eax -; FALLBACK16-NEXT: movl %edx, %ebx +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK16-NEXT: movl %edx, %edi ; FALLBACK16-NEXT: movl %ebx, %ecx -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movl %ebx, %edi +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 
4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl %esi, %ebx +; FALLBACK16-NEXT: movl 148(%esp,%esi), %esi ; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: movl %edi, %ecx +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %ebp +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 152(%esp,%ebx), %eax +; FALLBACK16-NEXT: leal (%eax,%eax), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %edi, %edx +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK16-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK16-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx @@ -21700,7 +20742,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, (%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: addl $236, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx @@ -21714,61 +20756,23 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl (%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%eax), %ecx -; 
FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%eax), %ecx -; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%eax), %ebp -; FALLBACK17-NEXT: movl 44(%eax), %ebx -; FALLBACK17-NEXT: movl 48(%eax), %edi +; FALLBACK17-NEXT: movups (%eax), %xmm0 +; FALLBACK17-NEXT: movups 16(%eax), %xmm1 +; FALLBACK17-NEXT: movups 32(%eax), %xmm2 +; FALLBACK17-NEXT: movl 48(%eax), %edx ; FALLBACK17-NEXT: movl 52(%eax), %esi -; FALLBACK17-NEXT: movl 56(%eax), %edx +; FALLBACK17-NEXT: movl 56(%eax), %edi ; FALLBACK17-NEXT: movl 60(%eax), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: sarl $31, %eax ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -21786,91 +20790,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %ecx, %ebx +; FALLBACK17-NEXT: andl $60, %ebx +; FALLBACK17-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK17-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: shrdl %cl, %edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %edi, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK17-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK17-NEXT: movl 
100(%esp,%ebx), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: sarl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) +; FALLBACK17-NEXT: sarl %cl, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl %eax, 56(%ecx) +; FALLBACK17-NEXT: movl %edx, 60(%ecx) +; FALLBACK17-NEXT: movl %edi, 48(%ecx) +; FALLBACK17-NEXT: movl %esi, 52(%ecx) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 40(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl %eax, 44(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) +; FALLBACK17-NEXT: movl %eax, 32(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) +; FALLBACK17-NEXT: movl %eax, 36(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) +; FALLBACK17-NEXT: movl %eax, 24(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl %eax, 28(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) +; FALLBACK17-NEXT: movl %eax, 16(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) +; FALLBACK17-NEXT: movl %eax, 20(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl %eax, 8(%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl %eax, 12(%ecx) +; FALLBACK17-NEXT: movl %ebx, (%ecx) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: movl %eax, 4(%ecx) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -21886,60 +20889,22 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx -; FALLBACK18-NEXT: movl 48(%eax), %edi -; FALLBACK18-NEXT: movl 52(%eax), %esi -; FALLBACK18-NEXT: movl 56(%eax), %edx -; FALLBACK18-NEXT: movl 60(%eax), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movups (%ecx), %xmm0 +; FALLBACK18-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK18-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK18-NEXT: movl 48(%ecx), %edx +; FALLBACK18-NEXT: movl 52(%ecx), %esi +; FALLBACK18-NEXT: movl 56(%ecx), %edi +; FALLBACK18-NEXT: movl 60(%ecx), %ecx ; FALLBACK18-NEXT: movl (%eax), %eax -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -22042,7 +21007,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %ecx, %esi ; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK18-NEXT: leal (,%ebp,2), %ecx ; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK18-NEXT: shrxl %edx, %eax, %edi @@ -22101,61 +21066,23 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; 
FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $188, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl (%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 12(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 16(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 20(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 24(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 28(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 36(%eax), %ecx -; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%eax), %ebp -; FALLBACK19-NEXT: movl 44(%eax), %ebx -; FALLBACK19-NEXT: movl 48(%eax), %edi +; FALLBACK19-NEXT: movups (%eax), %xmm0 +; FALLBACK19-NEXT: movups 16(%eax), %xmm1 +; FALLBACK19-NEXT: movups 32(%eax), %xmm2 +; FALLBACK19-NEXT: movl 48(%eax), %edx ; FALLBACK19-NEXT: movl 52(%eax), %esi -; FALLBACK19-NEXT: movl 56(%eax), %edx +; FALLBACK19-NEXT: movl 56(%eax), %edi ; FALLBACK19-NEXT: movl 60(%eax), %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %ecx -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: sarl $31, 
%eax ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -22176,12 +21103,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %ecx, %ebp ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shll $3, %ecx ; FALLBACK19-NEXT: andl $24, %ecx -; FALLBACK19-NEXT: shrdl %cl, %edx, %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %edx, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %esi @@ -22209,7 +21136,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi -; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx @@ -22221,10 +21148,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %edi, %edx ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %edi, %eax ; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 56(%ebp) ; FALLBACK19-NEXT: movl %esi, 48(%ebp) @@ -22232,7 +21159,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %ebx, 40(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 44(%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 32(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 36(%ebp) @@ -22246,12 +21173,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl %eax, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK19-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 12(%ebp) ; FALLBACK19-NEXT: movl %edi, (%ebp) ; FALLBACK19-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 4(%ebp) @@ -22269,7 +21196,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: subl $236, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 @@ -22304,150 +21231,176 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi -; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: andl $60, %ecx +; FALLBACK20-NEXT: movl 100(%esp,%ecx), %edx +; FALLBACK20-NEXT: movl %ecx, %ebx ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %edx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK20-NEXT: leal (,%ebp,2), %esi ; FALLBACK20-NEXT: movb %al, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl %eax, %edi +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movl %edi, %edx +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%eax,%eax), %edi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax +; FALLBACK20-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%edi,%edi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edx, %ebx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %eax +; FALLBACK20-NEXT: movb %bl, %cl ; 
FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %ebx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK20-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movl %edi, %ecx +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movl %edi, %edx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK20-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK20-NEXT: orl %eax, %edx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx @@ -22480,7 +21433,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: addl $236, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx @@ -22528,91 +21481,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %ecx, 
%ebx +; FALLBACK21-NEXT: andl $60, %ebx +; FALLBACK21-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: shrdl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %esi ; FALLBACK21-NEXT: shrdl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK21-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK21-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK21-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; 
FALLBACK21-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK21-NEXT: shrdl %cl, %edx, %eax +; FALLBACK21-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) +; FALLBACK21-NEXT: sarl %cl, %edx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl %eax, 56(%ecx) +; FALLBACK21-NEXT: movl %edx, 60(%ecx) +; FALLBACK21-NEXT: movl %edi, 48(%ecx) +; FALLBACK21-NEXT: movl %esi, 52(%ecx) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) +; FALLBACK21-NEXT: movl %eax, 40(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) +; FALLBACK21-NEXT: movl %eax, 44(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) +; FALLBACK21-NEXT: movl %eax, 32(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) +; FALLBACK21-NEXT: movl %eax, 36(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) +; FALLBACK21-NEXT: movl %eax, 24(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl %eax, 28(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl %eax, 16(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl %eax, 20(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl %eax, 8(%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl %eax, 12(%ecx) +; FALLBACK21-NEXT: movl %ebx, (%ecx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; FALLBACK21-NEXT: movl %eax, 4(%ecx) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -22746,7 +21698,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK22-NEXT: leal (,%ebp,2), %ecx ; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK22-NEXT: shrxl %edx, %eax, %edi @@ -22842,12 +21794,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %ecx, %ebp ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx ; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: shrdl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %edx, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %esi @@ -22887,10 +21839,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %edi, %edx ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi +; FALLBACK23-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %edi, %eax ; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK23-NEXT: movl %eax, 56(%ebp) ; FALLBACK23-NEXT: movl %esi, 48(%ebp) @@ -22912,12 +21864,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %eax, 20(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 8(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 12(%ebp) ; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%ebp) ; FALLBACK23-NEXT: movl %edi, (%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 4(%ebp) @@ -22935,7 +21887,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: subl $236, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 @@ -22968,150 +21920,176 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %esi -; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: andl $60, %ecx +; FALLBACK24-NEXT: movl 100(%esp,%ecx), %edx +; FALLBACK24-NEXT: movl %ecx, %ebx ; FALLBACK24-NEXT: shll $3, %eax ; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: movl %edx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK24-NEXT: leal (,%ebp,2), %esi ; FALLBACK24-NEXT: movb %al, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 
64(%esp,%esi), %edi +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl %eax, %edi +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movl %edi, %edx +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%eax,%eax), %edi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax +; FALLBACK24-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%edi,%edi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edx, %ebx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %eax +; FALLBACK24-NEXT: movb %bl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: movl %ebx, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK24-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movl %edi, %ecx +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movl %edi, %edx +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, 
%cl ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK24-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK24-NEXT: orl %eax, %edx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx @@ -23144,7 +22122,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: addl $236, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx @@ -23191,91 +22169,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %ecx, %ebx +; FALLBACK25-NEXT: andl $60, %ebx +; FALLBACK25-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: shrdl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %esi ; FALLBACK25-NEXT: shrdl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; 
FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK25-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK25-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 108(%esp,%ebx), %edx +; FALLBACK25-NEXT: shrdl %cl, %edx, %eax +; FALLBACK25-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) +; FALLBACK25-NEXT: sarl %cl, %edx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl %eax, 56(%ecx) +; FALLBACK25-NEXT: movl %edx, 60(%ecx) +; FALLBACK25-NEXT: movl %edi, 48(%ecx) +; FALLBACK25-NEXT: movl %esi, 52(%ecx) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) +; FALLBACK25-NEXT: movl %eax, 40(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) +; FALLBACK25-NEXT: movl %eax, 44(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) +; FALLBACK25-NEXT: movl %eax, 32(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) +; FALLBACK25-NEXT: movl %eax, 36(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) +; FALLBACK25-NEXT: movl %eax, 24(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl %eax, 28(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl %eax, 16(%ecx) ; 
FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl %eax, 20(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl %eax, 8(%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl %eax, 12(%ecx) +; FALLBACK25-NEXT: movl %ebx, (%ecx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: movl %eax, 4(%ecx) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -23408,7 +22385,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK26-NEXT: leal (,%ebp,2), %ecx ; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK26-NEXT: shrxl %edx, %eax, %edi @@ -23503,12 +22480,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %ecx, %ebp ; FALLBACK27-NEXT: andl $60, %ebp ; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx ; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: shrdl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %edx, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %esi @@ -23548,10 +22525,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %edi, %edx ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %edi, %eax ; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK27-NEXT: movl %eax, 56(%ebp) ; FALLBACK27-NEXT: movl %esi, 48(%ebp) @@ -23573,12 +22550,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl %eax, 20(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 8(%ebp) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK27-NEXT: movl %eax, 12(%ebp) ; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 12(%ebp) ; FALLBACK27-NEXT: movl %edi, (%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 4(%ebp) @@ -23597,7 +22574,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: subl $236, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 @@ -23630,150 +22607,176 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: andl $60, %ecx +; FALLBACK28-NEXT: movl 100(%esp,%ecx), %edx +; FALLBACK28-NEXT: movl %ecx, %ebx ; FALLBACK28-NEXT: shll $3, %eax ; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: movl %edx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 104(%esp,%ebx), %ebp +; FALLBACK28-NEXT: leal (,%ebp,2), %esi ; FALLBACK28-NEXT: movb %al, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl %eax, %edi +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movl %edi, %edx +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 112(%esp,%ebx), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%eax,%eax), %edi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax +; FALLBACK28-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%edi,%edi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 124(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 128(%esp,%ebx), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edx, %ebx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: movl 132(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %eax +; FALLBACK28-NEXT: movb %bl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax -; 
FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 136(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 140(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: movl %ebx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl 144(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK28-NEXT: movl 148(%esp,%esi), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movl %edi, %ecx +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 152(%esp,%ebx), %eax ; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movl %edi, %edx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK28-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl 156(%esp,%ebx), %ebx ; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK28-NEXT: orl %eax, %edx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx @@ -23806,7 +22809,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: addl $236, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx @@ -23853,91 +22856,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %ecx, %ebx +; FALLBACK29-NEXT: andl 
$60, %ebx +; FALLBACK29-NEXT: movl 56(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: shrdl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %esi ; FALLBACK29-NEXT: shrdl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 76(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK29-NEXT: movl 84(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 96(%esp,%ebx), %edi +; FALLBACK29-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl 100(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 
108(%esp,%ebx), %edx +; FALLBACK29-NEXT: shrdl %cl, %edx, %eax +; FALLBACK29-NEXT: movl 48(%esp,%ebx), %ebx +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) +; FALLBACK29-NEXT: sarl %cl, %edx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl %eax, 56(%ecx) +; FALLBACK29-NEXT: movl %edx, 60(%ecx) +; FALLBACK29-NEXT: movl %edi, 48(%ecx) +; FALLBACK29-NEXT: movl %esi, 52(%ecx) ; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) +; FALLBACK29-NEXT: movl %eax, 40(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) +; FALLBACK29-NEXT: movl %eax, 44(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) +; FALLBACK29-NEXT: movl %eax, 32(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) +; FALLBACK29-NEXT: movl %eax, 36(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) +; FALLBACK29-NEXT: movl %eax, 24(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl %eax, 28(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl %eax, 16(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl %eax, 20(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl %eax, 8(%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl %eax, 12(%ecx) +; FALLBACK29-NEXT: movl %ebx, (%ecx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: movl %eax, 4(%ecx) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -24070,7 +23072,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK30-NEXT: leal (,%ebp,2), %ecx ; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK30-NEXT: shrxl %edx, %eax, %edi @@ -24165,12 +23167,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %ecx, %ebp ; FALLBACK31-NEXT: andl $60, %ebp ; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 52(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx ; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: shrdl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; FALLBACK31-NEXT: shrdl %cl, %edx, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %esi @@ -24210,10 +23212,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %edi, %edx ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; FALLBACK31-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %edi, %eax ; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK31-NEXT: movl %eax, 56(%ebp) ; FALLBACK31-NEXT: movl %esi, 48(%ebp) @@ -24235,12 +23237,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl %eax, 20(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 8(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 12(%ebp) ; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%ebp) ; FALLBACK31-NEXT: movl %edi, (%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 4(%ebp) @@ -24263,51 +23265,37 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: ashr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pushq %rbx -; X64-SSE2-NEXT: movq (%rdi), %rax -; X64-SSE2-NEXT: movq 8(%rdi), %rcx -; X64-SSE2-NEXT: movq 16(%rdi), %r8 -; X64-SSE2-NEXT: movq 24(%rdi), %r9 -; X64-SSE2-NEXT: movq 32(%rdi), %r10 -; X64-SSE2-NEXT: movq 40(%rdi), %r11 -; X64-SSE2-NEXT: movq 48(%rdi), %rbx -; X64-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movups (%rdi), %xmm0 +; X64-SSE2-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movups 32(%rdi), %xmm2 +; X64-SSE2-NEXT: movq 48(%rdi), %rax +; X64-SSE2-NEXT: movq 56(%rdi), %rcx ; X64-SSE2-NEXT: movl (%rsi), %esi -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: sarq $63, %rdi -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
X64-SSE2-NEXT: sarq $63, %rcx +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $7, %esi -; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax -; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi -; X64-SSE2-NEXT: movq %rsi, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) -; X64-SSE2-NEXT: movq %rcx, 8(%rdx) -; X64-SSE2-NEXT: popq %rbx +; X64-SSE2-NEXT: movups -128(%rsp,%rsi,8), %xmm0 +; X64-SSE2-NEXT: movups -112(%rsp,%rsi,8), %xmm1 +; X64-SSE2-NEXT: movups -96(%rsp,%rsi,8), %xmm2 +; X64-SSE2-NEXT: movups -80(%rsp,%rsi,8), %xmm3 +; X64-SSE2-NEXT: movups %xmm3, 48(%rdx) +; X64-SSE2-NEXT: movups %xmm1, 16(%rdx) +; X64-SSE2-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE2-NEXT: movups %xmm0, (%rdx) +; X64-SSE2-NEXT: popq %rax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_64bytes_qwordOff: @@ -24381,142 +23369,58 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no ; ; X86-SSE2-LABEL: ashr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $188, %esp -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 40(%eax), %ebp -; X86-SSE2-NEXT: movl 44(%eax), %ebx -; X86-SSE2-NEXT: movl 48(%eax), %edi -; X86-SSE2-NEXT: movl 52(%eax), %esi -; X86-SSE2-NEXT: movl 56(%eax), %edx -; X86-SSE2-NEXT: movl 60(%eax), %ecx +; X86-SSE2-NEXT: subl $128, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movups (%edx), %xmm0 +; X86-SSE2-NEXT: movups 16(%edx), %xmm1 +; X86-SSE2-NEXT: 
movups 32(%edx), %xmm2 +; X86-SSE2-NEXT: movl 48(%edx), %esi +; X86-SSE2-NEXT: movl 52(%edx), %edi +; X86-SSE2-NEXT: movl 56(%edx), %ebx +; X86-SSE2-NEXT: movl 60(%edx), %edx +; X86-SSE2-NEXT: movl (%ecx), %ecx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, (%esp) +; X86-SSE2-NEXT: sarl $31, %edx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $7, %eax -; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx -; 
X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp -; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx -; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi -; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi -; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx -; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 36(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 20(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $188, %esp +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: andl $7, %ecx +; X86-SSE2-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE2-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE2-NEXT: movups 32(%esp,%ecx,8), %xmm2 +; X86-SSE2-NEXT: movups 48(%esp,%ecx,8), %xmm3 +; X86-SSE2-NEXT: movups %xmm3, 48(%eax) +; X86-SSE2-NEXT: movups %xmm2, 32(%eax) +; X86-SSE2-NEXT: movups %xmm1, 16(%eax) +; X86-SSE2-NEXT: movups %xmm0, (%eax) +; X86-SSE2-NEXT: addl $128, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx -; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_64bytes_qwordOff: diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..7180a82bee4aa 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -236,20 +236,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
(%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl @@ -363,17 +363,17 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -405,27 +405,27 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: testb $32, %al -; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: cmovnel %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: cmovel %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 4(%edi) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edi) ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -486,20 +486,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovnel %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl @@ -588,58 +588,54 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorps 
%xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%eax), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -655,35 +651,29 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -700,45 +690,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%edi) ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -755,36 +739,30 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -879,58 +857,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -940,45 +913,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: +; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%esi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes: @@ -990,46 +960,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 28(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1039,45 +1004,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%esi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %bitOff = load i128, ptr %bitOff.ptr, align 1 @@ -1089,23 +1051,23 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_16bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %al -; X64-NO-BMI2-NO-SHLD-NEXT: cmovneq %r8, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: cmovneq %r8, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: cmoveq %r8, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes: @@ -1164,7 +1126,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr 
%bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx @@ -1175,7 +1137,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -1184,41 +1146,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%ebx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1234,38 +1198,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1340,39 +1304,39 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, 
%ebx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1389,161 +1353,148 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r9d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9,8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9,8), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r9,8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9,8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r14 ; 
X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: 
movq %rax, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r10, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: @@ -1552,119 +1503,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $124, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ebx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx,4), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx,4), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax 
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi,4), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx,4), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $124, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1677,72 +1616,57 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $108, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx,4), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx,4), %esi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $108, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -1757,92 +1681,78 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
%edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, 32(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 28(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1856,73 +1766,57 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $108, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx,4), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx,4), %eax +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx,4), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx,4), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -1937,75 +1831,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq 
-40(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al @@ -2014,75 +1898,68 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: 
movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl ; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al @@ -2091,15 +1968,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rax, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes: @@ -2108,120 +1986,109 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $124, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al ; X86-NO-BMI2-NO-SHLD-NEXT: negb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: 
shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $124, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2235,71 +2102,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
20(%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 4(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -2315,101 +2167,82 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, 
{{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2424,73 +2257,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 
(%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%edi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%esi) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -2506,16 +2321,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -2523,53 +2337,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r9d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9,8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9,8), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r9,8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9,8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx 
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -2579,31 +2392,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al ; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -2615,39 +2426,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %r10, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -2657,18 +2466,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r9, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: @@ -2677,126 +2487,121 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $124, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %dh +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx,4), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp 
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx,4), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $124, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2809,78 +2614,69 @@ define void 
@ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $108, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx,4), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx,4), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx,4), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $108, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2895,28 +2691,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ecx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -2929,46 +2715,50 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl 
%eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi @@ -2977,16 +2767,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -3000,30 +2789,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $108, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%eax), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -3035,44 +2814,44 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx,4), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx,4), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx,4), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebx,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, 
%ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -3094,101 +2873,96 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: subq $24, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, (%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 
-{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: notl %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (,%r13,2), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; 
X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $24, %rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3202,60 +2976,52 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 48(%rdi), %xmm3 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, 
%xmm0 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rax +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 56(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -3270,80 +3036,72 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 48(%rdi), %xmm3 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), 
%r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %ebp +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (,%r13,2), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r13, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r8b ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 
24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 @@ -3358,59 +3116,52 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 48(%rdi), %xmm3 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 
-80(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 56(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -3422,256 +3173,220 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: subl $220, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 48(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 40(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $220, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3686,150 +3401,105 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
40(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -3845,168 +3515,122 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
%esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, 
%ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl 
%esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx @@ -4015,7 +3639,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4055,148 +3679,104 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %eax +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 52(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%esi) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 60(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -4212,113 +3792,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi -; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: subq $40, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; 
X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, (%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: negl %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movslq %ecx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%r10), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%r10), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; 
X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-NO-SHLD-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%r10), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%r10), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 56(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $40, 
%rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes: @@ -4326,61 +3906,54 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %eax, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), 
%rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 @@ -4394,82 +3967,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $24, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm3, (%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %eax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rsi), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rax, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r13, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r10 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r14, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r9b ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rsp,%rsi), %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbp, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %r13, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rbp, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $24, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -4483,60 +4049,53 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi 
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %eax, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9 -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 @@ -4548,258 +4107,231 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $220, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, 
%edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: 
movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 192(%esp,%eax), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: 
orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $220, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -4813,151 +4345,107 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edi, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%ebp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 60(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 52(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 40(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -4972,73 +4460,29 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax @@ -5078,18 +4522,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax @@ -5104,7 +4548,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx @@ -5112,11 +4556,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax @@ -5153,7 +4597,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) @@ -5161,7 +4605,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) @@ -5188,155 +4632,107 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, 
{{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 52(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -5357,105 +4753,102 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: subq $24, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, 
%edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: notl %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (,%r13,2), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r14 +; 
X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $24, %rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -5469,23 +4862,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rcx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -5499,34 +4886,34 @@ define void @ashr_64bytes(ptr %src.ptr, ptr 
%bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 56(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -5541,23 +4928,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), 
%rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -5571,54 +4952,54 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %ebp +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq 
%r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (,%r13,2), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rbp, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r13, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r8b ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r8, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 @@ -5633,23 +5014,17 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 
-{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -5663,33 +5038,34 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 56(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -5701,62 +5077,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $220, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 32(%eax), %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5774,195 +5112,203 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: 
movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edi), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 48(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 40(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $220, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -5977,60 +5323,22 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -6083,56 +5391,55 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi 
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 40(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -6148,62 +5455,22 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ecx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -6376,60 +5643,22 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -6451,18 +5680,18 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi @@ -6483,7 +5712,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi @@ -6495,10 +5724,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) @@ -6506,7 +5735,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) @@ -6520,12 +5749,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, (%esp), %edx # 4-byte Folded Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d71084c..ff0907a8c1ef2 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -434,58 +434,54 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %ebx -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-SHLD-NEXT: movb %bl, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movb %dl, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: @@ -566,58 +562,54 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%ecx) +; 
X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movw %si, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movw %dx, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: @@ -697,58 +689,54 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; 
X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movl %esi, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: @@ -845,26 +833,26 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al ; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -874,36 +862,34 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: ; 
X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $32, %esp +; X86-SHLD-NEXT: subl $36, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi -; X86-SHLD-NEXT: movl (%esp,%edx), %edi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-SHLD-NEXT: movl %edx, %ebx -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl 8(%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %esi +; X86-SHLD-NEXT: movl %esi, %edi ; X86-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edi, (%eax) -; X86-SHLD-NEXT: addl $32, %esp +; X86-SHLD-NEXT: movl (%esp,%eax), %eax +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-SHLD-NEXT: movl %eax, (%ecx) +; X86-SHLD-NEXT: addl $36, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi -; X86-SHLD-NEXT: popl %ebx ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: @@ -913,35 +899,35 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 
%edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -998,58 +984,54 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, 
{{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-SHLD-NEXT: movb %bl, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movb %dl, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1143,58 +1125,54 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; 
X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movw %si, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movw %dx, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1287,58 +1265,54 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps 
%xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movl %esi, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1395,8 +1369,8 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx @@ -1464,26 +1438,26 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp,4), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al ; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1493,36 +1467,34 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; 
X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $64, %esp +; X86-SHLD-NEXT: subl $68, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-SHLD-NEXT: movl %edx, %ebx -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl 8(%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %esi +; X86-SHLD-NEXT: movl %esi, %edi ; X86-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edi, (%eax) -; X86-SHLD-NEXT: addl $64, %esp +; X86-SHLD-NEXT: movl (%esp,%eax,4), %eax +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-SHLD-NEXT: movl %eax, (%ecx) +; X86-SHLD-NEXT: addl $68, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi -; X86-SHLD-NEXT: popl %ebx ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1532,35 +1504,35 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1594,21 +1566,21 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) @@ -1635,9 +1607,9 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -1690,10 +1662,10 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq @@ -1718,44 +1690,50 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, (%esp) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 
8(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -1770,7 +1748,7 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $92, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movups (%eax), %xmm0 @@ -1779,28 +1757,27 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %eax ; X86-SHLD-NEXT: shrb $5, %al -; X86-SHLD-NEXT: movzbl %al, %ebx -; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi -; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax -; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi -; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp -; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-SHLD-NEXT: movl %esi, 8(%edx) -; X86-SHLD-NEXT: movl %edi, 4(%edx) +; X86-SHLD-NEXT: movzbl %al, %esi +; X86-SHLD-NEXT: movl 8(%esp,%esi,4), %eax +; X86-SHLD-NEXT: movl 4(%esp,%esi,4), %edi +; X86-SHLD-NEXT: movl %edi, %edx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl 12(%esp,%esi,4), %ebx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-SHLD-NEXT: movl 16(%esp,%esi,4), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SHLD-NEXT: movl (%esp,%esi,4), %esi ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SHLD-NEXT: shrdl %cl, %esi, %eax -; X86-SHLD-NEXT: movl %eax, (%edx) -; X86-SHLD-NEXT: addl $92, %esp +; X86-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-SHLD-NEXT: movl %esi, (%ebp) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi ; X86-SHLD-NEXT: popl %ebx @@ -1937,9 +1914,7 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -1953,26 +1928,24 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx -; X86-SHLD-NEXT: subl $136, %esp -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: subl $140, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movups (%ecx), %xmm0 @@ -1986,15 +1959,15 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-SHLD-NEXT: andl $60, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %ebx -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-SHLD-NEXT: andl $60, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-SHLD-NEXT: movb %bl, (%eax) -; X86-SHLD-NEXT: addl $136, %esp -; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movb %dl, (%eax) +; X86-SHLD-NEXT: addl $140, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -2101,9 +2074,7 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -2117,26 +2088,24 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movw %ax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $136, %esp -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: subl $140, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movups (%ecx), %xmm0 @@ -2150,15 +2119,15 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-SHLD-NEXT: andl $60, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-SHLD-NEXT: andl $60, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movw %si, (%eax) -; X86-SHLD-NEXT: addl $136, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movw %dx, (%eax) +; X86-SHLD-NEXT: addl $140, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -2264,9 +2233,7 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -2280,26 +2247,24 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx +; 
X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $136, %esp -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: subl $140, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movups (%ecx), %xmm0 @@ -2313,15 +2278,15 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: leal (,%edx,8), %ecx -; X86-SHLD-NEXT: andl $60, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-SHLD-NEXT: andl $60, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movl %esi, (%eax) -; X86-SHLD-NEXT: addl $136, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $140, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -2385,8 +2350,8 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx @@ -2466,30 +2431,30 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -2499,15 +2464,13 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $128, %esp +; X86-SHLD-NEXT: subl $132, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) @@ -2517,23 +2480,23 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %esi -; X86-SHLD-NEXT: andl $60, %esi -; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-SHLD-NEXT: movl (%esp,%esi), %edx -; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: andl $60, %eax +; X86-SHLD-NEXT: movl 8(%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %esi ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: andl $24, %ecx -; X86-SHLD-NEXT: movl %esi, %ebx -; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-SHLD-NEXT: movl %esi, %edi +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-SHLD-NEXT: movl (%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edx, (%eax) -; X86-SHLD-NEXT: addl $128, %esp +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-SHLD-NEXT: movl %eax, (%ecx) +; X86-SHLD-NEXT: addl $132, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi -; X86-SHLD-NEXT: 
popl %ebx ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -2611,13 +2574,12 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax @@ -2625,6 +2587,7 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) @@ -2659,9 +2622,9 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax @@ -2727,12 +2690,11 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; @@ -2758,47 +2720,49 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $156, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -2813,7 +2777,7 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $156, %esp +; X86-SHLD-NEXT: subl $140, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movups (%eax), %xmm0 @@ -2826,29 +2790,28 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; 
X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movl %ecx, %edi -; X86-SHLD-NEXT: andl $60, %edi -; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-SHLD-NEXT: movl 16(%esp,%edi), %eax -; X86-SHLD-NEXT: movl 20(%esp,%edi), %ebx -; X86-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: andl $60, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx), %eax +; X86-SHLD-NEXT: movl 4(%esp,%edx), %esi ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: andl $24, %ecx -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-SHLD-NEXT: movl 28(%esp,%edi), %ebp -; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-SHLD-NEXT: movl 32(%esp,%edi), %edi -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: shrdl %cl, %edi, %ebp -; X86-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-SHLD-NEXT: movl %esi, 8(%edx) -; X86-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-SHLD-NEXT: movl %esi, %edi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-SHLD-NEXT: movl 12(%esp,%edx), %ebx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-SHLD-NEXT: movl 16(%esp,%edx), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SHLD-NEXT: movl (%esp,%edx), %edx ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SHLD-NEXT: shrdl %cl, %esi, %eax -; X86-SHLD-NEXT: movl %eax, (%edx) -; X86-SHLD-NEXT: addl $156, %esp +; X86-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-SHLD-NEXT: movl %edi, 4(%ebp) +; X86-SHLD-NEXT: movl %edx, (%ebp) +; X86-SHLD-NEXT: addl $140, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi ; X86-SHLD-NEXT: popl %ebx @@ -2889,7 +2852,7 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp @@ -2947,41 +2910,41 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: notb %r8b -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: 
movq -112(%rsp,%rsi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx @@ -2990,9 +2953,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3008,37 +2971,37 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 
%ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %r15, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3124,10 +3087,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) @@ -3143,7 +3106,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $188, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3159,80 +3122,87 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movb 
%ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %esi ; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 16(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3241,7 +3211,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $172, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi 
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3270,13 +3240,12 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movl %ecx, %edi ; X86-SHLD-NEXT: andl $60, %edi -; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx +; X86-SHLD-NEXT: movl 24(%esp,%edi), %eax ; X86-SHLD-NEXT: movl 20(%esp,%edi), %esi ; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: andl $24, %ecx -; X86-SHLD-NEXT: movl %edx, %eax -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SHLD-NEXT: movl 28(%esp,%edi), %edx ; X86-SHLD-NEXT: shrdl %cl, %edx, %eax @@ -3284,30 +3253,30 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-SHLD-NEXT: movl 32(%esp,%edi), %ebp ; X86-SHLD-NEXT: shrdl %cl, %ebp, %edx ; X86-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SHLD-NEXT: movl 36(%esp,%edi), %esi -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebp -; X86-SHLD-NEXT: movl 40(%esp,%edi), %edx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl 36(%esp,%edi), %ebx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp +; X86-SHLD-NEXT: movl 40(%esp,%edi), %esi +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-SHLD-NEXT: movl 44(%esp,%edi), %eax -; X86-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-SHLD-NEXT: movl 16(%esp,%edi), %ebx -; X86-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-SHLD-NEXT: shrdl %cl, %edi, %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SHLD-NEXT: movl %eax, 28(%edi) -; X86-SHLD-NEXT: movl %edx, 24(%edi) -; X86-SHLD-NEXT: movl %esi, 20(%edi) -; X86-SHLD-NEXT: movl %ebp, 16(%edi) -; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-SHLD-NEXT: movl %eax, 12(%edi) +; X86-SHLD-NEXT: shrdl %cl, %eax, %esi +; X86-SHLD-NEXT: movl 48(%esp,%edi), %edx +; X86-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-SHLD-NEXT: movl 16(%esp,%edi), %edi +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movl %eax, 28(%edx) +; X86-SHLD-NEXT: movl %esi, 24(%edx) +; X86-SHLD-NEXT: movl %ebx, 20(%edx) +; X86-SHLD-NEXT: movl %ebp, 16(%edx) +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SHLD-NEXT: movl %eax, 8(%edi) +; X86-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SHLD-NEXT: movl %eax, 12(%edx) ; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SHLD-NEXT: movl %eax, 4(%edi) -; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl %eax, 8(%edx) ; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SHLD-NEXT: shrdl %cl, %eax, %ebx -; X86-SHLD-NEXT: movl %ebx, (%edi) +; X86-SHLD-NEXT: movl %eax, 4(%edx) +; X86-SHLD-NEXT: movl %edi, (%edx) ; X86-SHLD-NEXT: addl $156, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi @@ -3349,7 +3318,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8d36eef952a2b..5dafba127584c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -260,23 +260,21 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -385,19 +383,19 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -506,19 +504,19 @@ define void 
@load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -605,56 +603,52 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X86-SHLD: # %bb.0: -; 
X86-SHLD-NEXT: pushl %ebx -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %ebx -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-SHLD-NEXT: movb %bl, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movb %dl, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: @@ -767,56 +761,52 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movw %si, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movw %dx, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: @@ -928,56 +918,52 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: subl $44, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: shll 
$3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movl %esi, (%eax) -; X86-SHLD-NEXT: addl $40, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $44, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: @@ -1105,26 +1091,26 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al ; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1134,35 +1120,33 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $32, %esp +; X86-SHLD-NEXT: subl $36, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups (%eax), %xmm0 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: 
movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $3, %dl -; X86-SHLD-NEXT: andb $12, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi -; X86-SHLD-NEXT: movl (%esp,%edx), %edi -; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-SHLD-NEXT: movl %edx, %ebx -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $3, %al +; X86-SHLD-NEXT: andb $12, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl 8(%esp,%eax), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax), %esi +; X86-SHLD-NEXT: movl %esi, %edi ; X86-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edi, (%eax) -; X86-SHLD-NEXT: addl $32, %esp +; X86-SHLD-NEXT: movl (%esp,%eax), %eax +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-SHLD-NEXT: movl %eax, (%ecx) +; X86-SHLD-NEXT: addl $36, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi -; X86-SHLD-NEXT: popl %ebx ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: @@ -1172,34 +1156,34 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1258,60 +1242,56 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # 
kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-SHLD-NEXT: movb %bl, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movb %dl, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: @@ -1406,60 +1386,56 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; 
X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movw %si, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movw %dx, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: @@ -1553,60 +1529,56 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; 
X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl (%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %eax ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-SHLD-NEXT: movl %esi, (%eax) -; X86-SHLD-NEXT: addl $72, %esp -; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: @@ -1663,8 +1635,8 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx @@ -1735,26 +1707,26 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebp,4), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al ; X86-NO-BMI2-NO-SHLD-NEXT: notb %al -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1764,37 +1736,35 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: ; X86-SHLD: # %bb.0: -; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $64, %esp +; X86-SHLD-NEXT: subl $68, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: 
movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-SHLD-NEXT: shll $3, %ecx ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %edx -; X86-SHLD-NEXT: shrb $5, %dl -; X86-SHLD-NEXT: movzbl %dl, %edx -; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi -; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi -; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-SHLD-NEXT: movl %edx, %ebx -; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %eax +; X86-SHLD-NEXT: movl 8(%esp,%eax,4), %edx +; X86-SHLD-NEXT: movl 4(%esp,%eax,4), %esi +; X86-SHLD-NEXT: movl %esi, %edi ; X86-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edi, (%eax) -; X86-SHLD-NEXT: addl $64, %esp +; X86-SHLD-NEXT: movl (%esp,%eax,4), %eax +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-SHLD-NEXT: movl %eax, (%ecx) +; X86-SHLD-NEXT: addl $68, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi -; X86-SHLD-NEXT: popl %ebx ; X86-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: @@ -1804,36 +1774,36 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 -; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1866,21 +1836,21 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) @@ -1908,9 +1878,9 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -1965,10 +1935,10 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq @@ -1980,58 +1950,65 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%ebp,2), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl 
%cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -2046,7 +2023,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-SHLD-NEXT: pushl %ebx ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi -; X86-SHLD-NEXT: subl $92, %esp +; X86-SHLD-NEXT: subl $76, %esp ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movups (%eax), %xmm0 @@ -2056,28 +2033,27 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-SHLD-NEXT: movl %ecx, %eax ; X86-SHLD-NEXT: shrb $5, %al -; X86-SHLD-NEXT: movzbl %al, %ebx -; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi -; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax -; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi -; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp -; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-SHLD-NEXT: movl %esi, 8(%edx) -; X86-SHLD-NEXT: movl %edi, 4(%edx) +; X86-SHLD-NEXT: movzbl %al, %esi +; X86-SHLD-NEXT: movl 8(%esp,%esi,4), %eax +; X86-SHLD-NEXT: movl 4(%esp,%esi,4), %edi +; X86-SHLD-NEXT: movl %edi, %edx +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl 12(%esp,%esi,4), %ebx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %eax +; X86-SHLD-NEXT: movl 16(%esp,%esi,4), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SHLD-NEXT: movl (%esp,%esi,4), %esi ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-SHLD-NEXT: shrdl %cl, %esi, 
%eax -; X86-SHLD-NEXT: movl %eax, (%edx) -; X86-SHLD-NEXT: addl $92, %esp +; X86-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-SHLD-NEXT: movl %esi, (%ebp) +; X86-SHLD-NEXT: addl $76, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi ; X86-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen_arith-5.ll b/llvm/test/CodeGen/X86/widen_arith-5.ll index 466249d1bf1d4..021d91ab4d067 100644 --- a/llvm/test/CodeGen/X86/widen_arith-5.ll +++ b/llvm/test/CodeGen/X86/widen_arith-5.ll @@ -13,7 +13,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: pmovsxbd {{.*#+}} xmm0 = [3,3,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,u] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 56001468898e4..9675f150dce68 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -40,7 +40,7 @@ define <4 x i32> @widen_ctpop_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX2-LABEL: widen_ctpop_v2i32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -64,7 +64,7 @@ define <4 x i32> @widen_ctpop_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX512VL-LABEL: widen_ctpop_v2i32_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -138,8 +138,7 @@ define <8 x i32> @widen_ctpop_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -157,10 +156,9 @@ define <8 x i32> @widen_ctpop_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -246,8 +244,7 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -273,10 +270,9 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 ; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -375,7 +371,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX2-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -476,8 +472,7 @@ define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -608,8 +603,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; 
AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -728,7 +722,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX2-NEXT: vpcmpeqb %xmm6, %xmm4, %xmm7 @@ -829,8 +823,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -961,8 +954,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -1067,7 +1059,7 @@ define <4 x i32> @widen_cttz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1170,8 +1162,7 @@ define <8 x i32> @widen_cttz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1287,8 +1278,7 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x 
i32> %a1, <2 x i32> ; AVX2-NEXT: vpandn %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 @@ -1397,7 +1387,7 @@ define <4 x i32> @widen_cttz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 @@ -1500,8 +1490,7 @@ define <8 x i32> @widen_cttz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1617,8 +1606,7 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: vpandn %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/widen_bitops-0.ll b/llvm/test/CodeGen/X86/widen_bitops-0.ll index 7d91502694ce4..7781192298bca 100644 --- a/llvm/test/CodeGen/X86/widen_bitops-0.ll +++ b/llvm/test/CodeGen/X86/widen_bitops-0.ll @@ -138,9 +138,9 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X86-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: pand %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: pextrb $1, %xmm1, %edx ; X86-NEXT: pextrb $2, %xmm1, %ecx +; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: # kill: def $dl killed $dl killed $edx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx @@ -155,9 +155,9 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X64-NEXT: pinsrb $1, %esi, %xmm1 ; X64-NEXT: pinsrb $2, %edx, %xmm1 ; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pextrb $1, %xmm1, %edx ; X64-NEXT: pextrb $2, %xmm1, %ecx 
+; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: # kill: def $dl killed $dl killed $edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx @@ -179,9 +179,9 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X86-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: pxor %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: pextrb $1, %xmm1, %edx ; X86-NEXT: pextrb $2, %xmm1, %ecx +; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: # kill: def $dl killed $dl killed $edx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx @@ -196,9 +196,9 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X64-NEXT: pinsrb $1, %esi, %xmm1 ; X64-NEXT: pinsrb $2, %edx, %xmm1 ; X64-NEXT: pxor %xmm0, %xmm1 -; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pextrb $1, %xmm1, %edx ; X64-NEXT: pextrb $2, %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: # kill: def $dl killed $dl killed $edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx @@ -220,9 +220,9 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X86-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: por %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: pextrb $1, %xmm1, %edx ; X86-NEXT: pextrb $2, %xmm1, %ecx +; X86-NEXT: movd %xmm1, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: # kill: def $dl killed $dl killed $edx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx @@ -237,9 +237,9 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X64-NEXT: pinsrb $1, %esi, %xmm1 ; X64-NEXT: pinsrb $2, %edx, %xmm1 ; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: pextrb $1, %xmm1, %edx ; X64-NEXT: pextrb $2, %xmm1, %ecx +; X64-NEXT: movd %xmm1, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: # kill: def $dl killed $dl killed $edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll index cd06f27dcc55c..69a9e20cbff9b 100644 --- a/llvm/test/CodeGen/X86/widen_cast-2.ll +++ b/llvm/test/CodeGen/X86/widen_cast-2.ll @@ -22,9 +22,9 @@ define void @convert(ptr %dst, ptr %src) nounwind { ; CHECK-NEXT: psubw %xmm0, %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) -; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) +; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) ; CHECK-NEXT: jle .LBB0_2 diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll index 7468e229e3a84..58335a2325b51 100644 --- a/llvm/test/CodeGen/X86/widen_cast-4.ll +++ b/llvm/test/CodeGen/X86/widen_cast-4.ll @@ -23,8 +23,8 @@ define void @update(ptr %dst_i, ptr %src_i, i32 %n) nounwind { ; WIDE-NEXT: leal (,%eax,8), %edx ; WIDE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIDE-NEXT: addl %edx, %ecx -; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; WIDE-NEXT: psubb %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll index d4555207d5559..85e700daf1f92 100644 
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll +++ b/llvm/test/CodeGen/X86/widen_conv-4.ll @@ -32,9 +32,9 @@ define void @convert_v7i16_v7f32(ptr %dst.addr, <7 x i16> %src) nounwind { ; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 ; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax) +; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 ; X86-SSE42-NEXT: movups %xmm1, (%eax) ; X86-SSE42-NEXT: movss %xmm0, 16(%eax) ; X86-SSE42-NEXT: retl @@ -59,8 +59,8 @@ define void @convert_v7i16_v7f32(ptr %dst.addr, <7 x i16> %src) nounwind { ; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 ; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi) +; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 ; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi) ; X64-SSE42-NEXT: movups %xmm1, (%rdi) ; X64-SSE42-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll index f8cde4cf223a7..bbcda5da00a14 100644 --- a/llvm/test/CodeGen/X86/widen_fadd.ll +++ b/llvm/test/CodeGen/X86/widen_fadd.ll @@ -15,8 +15,8 @@ define void @widen_fadd_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movlps %xmm2, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movups %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fadd_v2f32_v4f32: @@ -55,14 +55,14 @@ define void @widen_fadd_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: addps %xmm0, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: movlps %xmm4, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) -; SSE-NEXT: movlps %xmm1, 16(%rdx) -; SSE-NEXT: movlps %xmm2, 24(%rdx) +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movups %xmm4, (%rdx) +; SSE-NEXT: movups %xmm0, 16(%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fadd_v2f32_v8f32: @@ -116,38 +116,38 @@ define void @widen_fadd_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-LABEL: widen_fadd_v2f32_v16f32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: addps %xmm0, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: addps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: addps %xmm4, 
%xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: addps %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero -; SSE-NEXT: addps %xmm3, %xmm8 -; SSE-NEXT: movlps %xmm4, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) -; SSE-NEXT: movlps %xmm1, 16(%rdx) -; SSE-NEXT: movlps %xmm2, 24(%rdx) -; SSE-NEXT: movlps %xmm5, 32(%rdx) -; SSE-NEXT: movlps %xmm6, 40(%rdx) -; SSE-NEXT: movlps %xmm7, 48(%rdx) -; SSE-NEXT: movlps %xmm8, 56(%rdx) +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm1, 16(%rdx) +; SSE-NEXT: movups %xmm3, 32(%rdx) +; SSE-NEXT: movups %xmm4, 48(%rdx) ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: widen_fadd_v2f32_v16f32: @@ -254,14 +254,14 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7 ; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0 -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2 -; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10] -; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0 +; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,2,8,10,0,2,8,10] +; AVX512VL-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm3 +; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm3, %zmm2 +; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -374,16 +374,27 @@ define <16 x float> @widen_fadd_v4f32_v16f32_const(<4 x float> %x, <4 x float> % ; SSE-NEXT: addps %xmm4, %xmm3 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: widen_fadd_v4f32_v16f32_const: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] -; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: widen_fadd_v4f32_v16f32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: widen_fadd_v4f32_v16f32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: widen_fadd_v4f32_v16f32_const: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll index fdf895921ca67..28a662fa1104d 100644 --- a/llvm/test/CodeGen/X86/widen_fdiv.ll +++ b/llvm/test/CodeGen/X86/widen_fdiv.ll @@ -15,8 +15,8 @@ define void @widen_fdiv_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: divps %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: divps %xmm2, %xmm1 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movups %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fdiv_v2f32_v4f32: @@ -55,14 +55,14 @@ define void @widen_fdiv_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: divps %xmm4, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: divps %xmm4, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: divps %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: divps %xmm4, %xmm3 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) -; SSE-NEXT: movlps %xmm2, 16(%rdx) -; SSE-NEXT: movlps %xmm3, 24(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: divps %xmm1, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: divps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm2, 16(%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fdiv_v2f32_v8f32: @@ -104,37 +104,37 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-LABEL: widen_fdiv_v2f32_v16f32: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: divps %xmm4, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: divps %xmm4, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: divps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: divps %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: divps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: divps %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: divps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} 
xmm4 = mem[0],zero ; SSE-NEXT: divps %xmm4, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: divps %xmm5, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: divps %xmm6, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: divps %xmm7, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero -; SSE-NEXT: divps %xmm8, %xmm7 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) -; SSE-NEXT: movlps %xmm2, 16(%rdx) -; SSE-NEXT: movlps %xmm3, 24(%rdx) -; SSE-NEXT: movlps %xmm4, 32(%rdx) -; SSE-NEXT: movlps %xmm5, 40(%rdx) -; SSE-NEXT: movlps %xmm6, 48(%rdx) -; SSE-NEXT: movlps %xmm7, 56(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm1, 16(%rdx) +; SSE-NEXT: movups %xmm2, 32(%rdx) +; SSE-NEXT: movups %xmm3, 48(%rdx) ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: widen_fdiv_v2f32_v16f32: @@ -202,13 +202,13 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero ; AVX512VL-NEXT: vdivps %xmm4, %xmm3, %xmm3 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10] +; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vmovups (%rdi), %ymm4 +; AVX512VL-NEXT: vdivps (%rsi), %ymm4, %ymm4 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,8,10,0,2,8,10] -; AVX512VL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512VL-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 -; AVX512VL-NEXT: vmovups (%rdi), %ymm0 -; AVX512VL-NEXT: vdivps (%rsi), %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vinsertf64x4 $0, %ymm4, %zmm3, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll index 16baa068fc24f..bfd0b891b88d9 100644 --- a/llvm/test/CodeGen/X86/widen_fmul.ll +++ b/llvm/test/CodeGen/X86/widen_fmul.ll @@ -15,8 +15,8 @@ define void @widen_fmul_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: mulps %xmm0, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: movlps %xmm2, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movups %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fmul_v2f32_v4f32: @@ -55,14 +55,14 @@ define void @widen_fmul_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: mulps %xmm0, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm2 -; SSE-NEXT: movlps %xmm4, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) -; SSE-NEXT: movlps %xmm1, 16(%rdx) -; SSE-NEXT: movlps %xmm2, 24(%rdx) +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movups %xmm4, (%rdx) +; SSE-NEXT: movups %xmm0, 16(%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fmul_v2f32_v8f32: @@ -116,38 
+116,38 @@ define void @widen_fmul_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-LABEL: widen_fmul_v2f32_v16f32: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: mulps %xmm0, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: mulps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: mulps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: mulps %xmm2, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero -; SSE-NEXT: mulps %xmm3, %xmm8 -; SSE-NEXT: movlps %xmm4, (%rdx) -; SSE-NEXT: movlps %xmm0, 8(%rdx) -; SSE-NEXT: movlps %xmm1, 16(%rdx) -; SSE-NEXT: movlps %xmm2, 24(%rdx) -; SSE-NEXT: movlps %xmm5, 32(%rdx) -; SSE-NEXT: movlps %xmm6, 40(%rdx) -; SSE-NEXT: movlps %xmm7, 48(%rdx) -; SSE-NEXT: movlps %xmm8, 56(%rdx) +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm1, 16(%rdx) +; SSE-NEXT: movups %xmm3, 32(%rdx) +; SSE-NEXT: movups %xmm4, 48(%rdx) ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: widen_fmul_v2f32_v16f32: @@ -254,14 +254,14 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vmulps %xmm7, %xmm8, %xmm7 ; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0 -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2 -; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10] -; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0 +; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,2,8,10,0,2,8,10] +; AVX512VL-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm3 +; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm3, %zmm2 +; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -374,16 +374,27 @@ define <16 x float> 
@widen_fmul_v4f32_v16f32_const(<4 x float> %x, <4 x float> % ; SSE-NEXT: mulps %xmm4, %xmm3 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: widen_fmul_v4f32_v16f32_const: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] -; AVX1OR2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: widen_fmul_v4f32_v16f32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: widen_fmul_v4f32_v16f32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] +; AVX2-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: widen_fmul_v4f32_v16f32_const: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll index 8dcd887ab4144..74436f798e11a 100644 --- a/llvm/test/CodeGen/X86/widen_fsub.ll +++ b/llvm/test/CodeGen/X86/widen_fsub.ll @@ -15,8 +15,8 @@ define void @widen_fsub_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: subps %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: subps %xmm2, %xmm1 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movups %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fsub_v2f32_v4f32: @@ -55,14 +55,14 @@ define void @widen_fsub_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-NEXT: subps %xmm4, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: subps %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: subps %xmm4, %xmm3 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) -; SSE-NEXT: movlps %xmm2, 16(%rdx) -; SSE-NEXT: movlps %xmm3, 24(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: subps %xmm1, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm2, 16(%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: widen_fsub_v2f32_v8f32: @@ -117,37 +117,37 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; SSE-LABEL: widen_fsub_v2f32_v16f32: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; 
SSE-NEXT: subps %xmm4, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: subps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: subps %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: subps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: subps %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: subps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: subps %xmm4, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: subps %xmm5, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: subps %xmm6, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: subps %xmm7, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero -; SSE-NEXT: subps %xmm8, %xmm7 -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movlps %xmm1, 8(%rdx) -; SSE-NEXT: movlps %xmm2, 16(%rdx) -; SSE-NEXT: movlps %xmm3, 24(%rdx) -; SSE-NEXT: movlps %xmm4, 32(%rdx) -; SSE-NEXT: movlps %xmm5, 40(%rdx) -; SSE-NEXT: movlps %xmm6, 48(%rdx) -; SSE-NEXT: movlps %xmm7, 56(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movups %xmm0, (%rdx) +; SSE-NEXT: movups %xmm1, 16(%rdx) +; SSE-NEXT: movups %xmm2, 32(%rdx) +; SSE-NEXT: movups %xmm3, 48(%rdx) ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: widen_fsub_v2f32_v16f32: @@ -254,14 +254,14 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) { ; AVX512VL-NEXT: vsubps %xmm8, %xmm7, %xmm7 ; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] ; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0 -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2 -; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10] -; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0 +; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,2,8,10,0,2,8,10] +; AVX512VL-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm3 +; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm3, %zmm2 +; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512VL-NEXT: vmovupd %zmm0, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -374,16 +374,27 @@ define <16 x float> @widen_fsub_v4f32_v16f32_const(<4 x float> %x, <4 x float> % ; SSE-NEXT: subps %xmm4, %xmm3 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: widen_fsub_v4f32_v16f32_const: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] -; AVX1OR2-NEXT: vaddps %ymm1, %ymm0, %ymm0 
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: widen_fsub_v4f32_v16f32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: widen_fsub_v4f32_v16f32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0,-2.0E+0] +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: widen_fsub_v4f32_v16f32_const: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll index 4d0a5da4e7cb6..c16969dc2d16d 100644 --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -38,18 +38,18 @@ define void @add3i32_2(ptr sret(%i32vec3) %ret, ptr %ap, ptr %bp) { ; X86-LABEL: add3i32_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pinsrd $1, 4(%edx), %xmm0 -; X86-NEXT: pinsrd $2, 8(%edx), %xmm0 +; X86-NEXT: pinsrd $1, 4(%eax), %xmm0 +; X86-NEXT: pinsrd $2, 8(%eax), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: pextrd $1, %xmm1, 4(%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) +; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -81,9 +81,9 @@ define void @add7i32(ptr sret(%i32vec7) %ret, ptr %ap, ptr %bp) { ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -94,8 +94,8 @@ define void @add7i32(ptr sret(%i32vec7) %ret, ptr %ap, ptr %bp) { ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddd (%rdx), %xmm0 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec7, ptr %ap, align 16 @@ -150,9 +150,9 @@ define void @add3i16(ptr nocapture sret(%i16vec3) %ret, ptr %ap, ptr %bp) nounwi ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pinsrw $2, 4(%edx), %xmm0 +; X86-NEXT: pinsrw $2, 4(%ecx), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: pinsrw $2, 4(%ecx), %xmm1 ; X86-NEXT: paddw 
%xmm0, %xmm1 @@ -215,18 +215,18 @@ define void @add12i16(ptr nocapture sret(%i16vec12) %ret, ptr %ap, ptr %bp) noun ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add12i16: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movdqa (%rsi), %xmm0 ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddw (%rdx), %xmm0 ; X64-NEXT: paddw 16(%rdx), %xmm1 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq @@ -317,11 +317,11 @@ define void @add31i8(ptr nocapture sret(%i8vec31) %ret, ptr %ap, ptr %bp) nounwi ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) ; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -332,10 +332,10 @@ define void @add31i8(ptr nocapture sret(%i8vec31) %ret, ptr %ap, ptr %bp) nounwi ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddb (%rdx), %xmm0 ; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) ; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i8vec31, ptr %ap, align 16 @@ -352,21 +352,20 @@ define void @rot(ptr nocapture sret(%i8vec3pack) %result, ptr %X, ptr %rot) noun ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movb $-98, 2(%edx) -; X86-NEXT: movw $-24930, (%edx) # imm = 0x9E9E -; X86-NEXT: movb $1, 2(%ecx) -; X86-NEXT: movw $257, (%ecx) # imm = 0x101 +; X86-NEXT: movb $-98, 2(%ecx) +; X86-NEXT: movw $-24930, (%ecx) # imm = 0x9E9E +; X86-NEXT: movb $1, 2(%eax) +; X86-NEXT: movw $257, (%eax) # imm = 0x101 ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: psrlw $1, %xmm0 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pextrb $2, %xmm0, 2(%eax) ; X86-NEXT: pextrw $0, %xmm0, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: rot: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movb $-98, 2(%rsi) ; X64-NEXT: movw $-24930, (%rsi) # imm = 0x9E9E ; X64-NEXT: movb $1, 2(%rdx) @@ -374,6 +373,7 @@ define void @rot(ptr nocapture sret(%i8vec3pack) %result, ptr %X, ptr %rot) noun ; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: psrlw $1, %xmm0 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: pextrb $2, %xmm0, 2(%rdi) ; X64-NEXT: pextrw $0, %xmm0, (%rdi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/win64-byval.ll b/llvm/test/CodeGen/X86/win64-byval.ll index 573a0016e8772..9d4a495eed922 100644 --- a/llvm/test/CodeGen/X86/win64-byval.ll +++ b/llvm/test/CodeGen/X86/win64-byval.ll @@ -13,10 +13,8 @@ define void @bar() { ; CHECK-NEXT: .seh_stackalloc 56 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: movq .refptr.G(%rip), %rax -; CHECK-NEXT: movq (%rax), %rcx -; CHECK-NEXT: movq 8(%rax), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; 
CHECK-NEXT: movups (%rax), %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: callq foo ; CHECK-NEXT: nop @@ -38,10 +36,8 @@ define void @baz(ptr byval({ float, double }) %arg) { ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .seh_stackalloc 56 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: movq (%rcx), %rax -; CHECK-NEXT: movq 8(%rcx), %rcx -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups (%rcx), %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: callq foo ; CHECK-NEXT: nop @@ -64,18 +60,12 @@ define void @test() { ; CHECK-NEXT: .seh_stackalloc 136 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: movq .refptr.G(%rip), %rax -; CHECK-NEXT: movq (%rax), %rcx -; CHECK-NEXT: movq 8(%rax), %rax -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movups (%rax), %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $10, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/x86-64-varargs.ll b/llvm/test/CodeGen/X86/x86-64-varargs.ll index f947327d4c562..eeda71792aa37 100644 --- a/llvm/test/CodeGen/X86/x86-64-varargs.ll +++ b/llvm/test/CodeGen/X86/x86-64-varargs.ll @@ -16,252 +16,252 @@ define void @func(...) 
nounwind { ; CHECK-X64-LABEL: func: ; CHECK-X64: ## %bb.0: ## %entry ; CHECK-X64-NEXT: pushq %rbx -; CHECK-X64-NEXT: subq $224, %rsp +; CHECK-X64-NEXT: subq $240, %rsp ; CHECK-X64-NEXT: testb %al, %al ; CHECK-X64-NEXT: je LBB0_47 ; CHECK-X64-NEXT: ## %bb.46: ## %entry -; CHECK-X64-NEXT: movaps %xmm0, 96(%rsp) -; CHECK-X64-NEXT: movaps %xmm1, 112(%rsp) -; CHECK-X64-NEXT: movaps %xmm2, 128(%rsp) -; CHECK-X64-NEXT: movaps %xmm3, 144(%rsp) -; CHECK-X64-NEXT: movaps %xmm4, 160(%rsp) -; CHECK-X64-NEXT: movaps %xmm5, 176(%rsp) -; CHECK-X64-NEXT: movaps %xmm6, 192(%rsp) -; CHECK-X64-NEXT: movaps %xmm7, 208(%rsp) +; CHECK-X64-NEXT: movaps %xmm0, 112(%rsp) +; CHECK-X64-NEXT: movaps %xmm1, 128(%rsp) +; CHECK-X64-NEXT: movaps %xmm2, 144(%rsp) +; CHECK-X64-NEXT: movaps %xmm3, 160(%rsp) +; CHECK-X64-NEXT: movaps %xmm4, 176(%rsp) +; CHECK-X64-NEXT: movaps %xmm5, 192(%rsp) +; CHECK-X64-NEXT: movaps %xmm6, 208(%rsp) +; CHECK-X64-NEXT: movaps %xmm7, 224(%rsp) ; CHECK-X64-NEXT: LBB0_47: ## %entry -; CHECK-X64-NEXT: movq %rdi, 48(%rsp) -; CHECK-X64-NEXT: movq %rsi, 56(%rsp) -; CHECK-X64-NEXT: movq %rdx, 64(%rsp) -; CHECK-X64-NEXT: movq %rcx, 72(%rsp) -; CHECK-X64-NEXT: movq %r8, 80(%rsp) -; CHECK-X64-NEXT: movq %r9, 88(%rsp) +; CHECK-X64-NEXT: movq %rdi, 64(%rsp) +; CHECK-X64-NEXT: movq %rsi, 72(%rsp) +; CHECK-X64-NEXT: movq %rdx, 80(%rsp) +; CHECK-X64-NEXT: movq %rcx, 88(%rsp) +; CHECK-X64-NEXT: movq %r8, 96(%rsp) +; CHECK-X64-NEXT: movq %r9, 104(%rsp) ; CHECK-X64-NEXT: movabsq $206158430208, %rax ## imm = 0x3000000000 -; CHECK-X64-NEXT: movq %rax, (%rsp) -; CHECK-X64-NEXT: leaq 240(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, 8(%rsp) -; CHECK-X64-NEXT: leaq 48(%rsp), %rax +; CHECK-X64-NEXT: leaq 256(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, 16(%rsp) -; CHECK-X64-NEXT: movl (%rsp), %ecx +; CHECK-X64-NEXT: leaq 64(%rsp), %rax +; CHECK-X64-NEXT: movq %rax, 24(%rsp) +; CHECK-X64-NEXT: movl 8(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_2 ; CHECK-X64-NEXT: ## %bb.1: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax +; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $8, %ecx -; CHECK-X64-NEXT: movl %ecx, (%rsp) +; CHECK-X64-NEXT: movl %ecx, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_3 ; CHECK-X64-NEXT: LBB0_2: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 8(%rsp) +; CHECK-X64-NEXT: movq %rcx, 16(%rsp) ; CHECK-X64-NEXT: LBB0_3: ## %entry ; CHECK-X64-NEXT: movl (%rax), %r10d -; CHECK-X64-NEXT: movl (%rsp), %ecx +; CHECK-X64-NEXT: movl 8(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_5 ; CHECK-X64-NEXT: ## %bb.4: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax +; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $8, %ecx -; CHECK-X64-NEXT: movl %ecx, (%rsp) +; CHECK-X64-NEXT: movl %ecx, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_6 ; CHECK-X64-NEXT: LBB0_5: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 8(%rsp) +; CHECK-X64-NEXT: movq %rcx, 16(%rsp) ; CHECK-X64-NEXT: LBB0_6: ## %entry ; CHECK-X64-NEXT: movl (%rax), %r11d -; CHECK-X64-NEXT: movl (%rsp), %ecx +; CHECK-X64-NEXT: movl 8(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_8 ; CHECK-X64-NEXT: ## %bb.7: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax 
+; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $8, %ecx -; CHECK-X64-NEXT: movl %ecx, (%rsp) +; CHECK-X64-NEXT: movl %ecx, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_9 ; CHECK-X64-NEXT: LBB0_8: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 8(%rsp) +; CHECK-X64-NEXT: movq %rcx, 16(%rsp) ; CHECK-X64-NEXT: LBB0_9: ## %entry ; CHECK-X64-NEXT: movl (%rax), %r9d +; CHECK-X64-NEXT: movq 24(%rsp), %rax +; CHECK-X64-NEXT: movq %rax, 48(%rsp) +; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq %rax, 32(%rsp) ; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, 40(%rsp) -; CHECK-X64-NEXT: movq (%rsp), %rax -; CHECK-X64-NEXT: movq 8(%rsp), %rcx -; CHECK-X64-NEXT: movq %rcx, 32(%rsp) -; CHECK-X64-NEXT: movq %rax, 24(%rsp) -; CHECK-X64-NEXT: movl 4(%rsp), %eax +; CHECK-X64-NEXT: movl 12(%rsp), %eax ; CHECK-X64-NEXT: cmpl $176, %eax ; CHECK-X64-NEXT: jae LBB0_11 ; CHECK-X64-NEXT: ## %bb.10: ## %entry ; CHECK-X64-NEXT: addl $16, %eax -; CHECK-X64-NEXT: movl %eax, 4(%rsp) +; CHECK-X64-NEXT: movl %eax, 12(%rsp) ; CHECK-X64-NEXT: jmp LBB0_12 ; CHECK-X64-NEXT: LBB0_11: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 8(%rsp) +; CHECK-X64-NEXT: movq %rax, 16(%rsp) ; CHECK-X64-NEXT: LBB0_12: ## %entry -; CHECK-X64-NEXT: movl 28(%rsp), %ecx +; CHECK-X64-NEXT: movl 36(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $176, %ecx ; CHECK-X64-NEXT: jae LBB0_14 ; CHECK-X64-NEXT: ## %bb.13: ## %entry -; CHECK-X64-NEXT: movq 40(%rsp), %rax +; CHECK-X64-NEXT: movq 48(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $16, %ecx -; CHECK-X64-NEXT: movl %ecx, 28(%rsp) +; CHECK-X64-NEXT: movl %ecx, 36(%rsp) ; CHECK-X64-NEXT: jmp LBB0_15 ; CHECK-X64-NEXT: LBB0_14: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 32(%rsp) +; CHECK-X64-NEXT: movq %rcx, 40(%rsp) ; CHECK-X64-NEXT: LBB0_15: ## %entry ; CHECK-X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-X64-NEXT: movl (%rsp), %ecx +; CHECK-X64-NEXT: movl 8(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_17 ; CHECK-X64-NEXT: ## %bb.16: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax +; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $8, %ecx -; CHECK-X64-NEXT: movl %ecx, (%rsp) +; CHECK-X64-NEXT: movl %ecx, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_18 ; CHECK-X64-NEXT: LBB0_17: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 8(%rsp) +; CHECK-X64-NEXT: movq %rcx, 16(%rsp) ; CHECK-X64-NEXT: LBB0_18: ## %entry ; CHECK-X64-NEXT: movl (%rax), %r8d -; CHECK-X64-NEXT: movl 24(%rsp), %eax +; CHECK-X64-NEXT: movl 32(%rsp), %eax ; CHECK-X64-NEXT: cmpl $48, %eax ; CHECK-X64-NEXT: jae LBB0_20 ; CHECK-X64-NEXT: ## %bb.19: ## %entry ; CHECK-X64-NEXT: addl $8, %eax -; CHECK-X64-NEXT: movl %eax, 24(%rsp) +; CHECK-X64-NEXT: movl %eax, 32(%rsp) ; CHECK-X64-NEXT: jmp LBB0_21 ; CHECK-X64-NEXT: LBB0_20: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 32(%rsp) +; 
CHECK-X64-NEXT: movq %rax, 40(%rsp) ; CHECK-X64-NEXT: LBB0_21: ## %entry -; CHECK-X64-NEXT: movl (%rsp), %eax +; CHECK-X64-NEXT: movl 8(%rsp), %eax ; CHECK-X64-NEXT: cmpl $48, %eax ; CHECK-X64-NEXT: jae LBB0_23 ; CHECK-X64-NEXT: ## %bb.22: ## %entry ; CHECK-X64-NEXT: addl $8, %eax -; CHECK-X64-NEXT: movl %eax, (%rsp) +; CHECK-X64-NEXT: movl %eax, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_24 ; CHECK-X64-NEXT: LBB0_23: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 8(%rsp) +; CHECK-X64-NEXT: movq %rax, 16(%rsp) ; CHECK-X64-NEXT: LBB0_24: ## %entry -; CHECK-X64-NEXT: movl 24(%rsp), %ecx +; CHECK-X64-NEXT: movl 32(%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_26 ; CHECK-X64-NEXT: ## %bb.25: ## %entry -; CHECK-X64-NEXT: movq 40(%rsp), %rax +; CHECK-X64-NEXT: movq 48(%rsp), %rax ; CHECK-X64-NEXT: addq %rcx, %rax ; CHECK-X64-NEXT: addl $8, %ecx -; CHECK-X64-NEXT: movl %ecx, 24(%rsp) +; CHECK-X64-NEXT: movl %ecx, 32(%rsp) ; CHECK-X64-NEXT: jmp LBB0_27 ; CHECK-X64-NEXT: LBB0_26: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rcx ; CHECK-X64-NEXT: addq $8, %rcx -; CHECK-X64-NEXT: movq %rcx, 32(%rsp) +; CHECK-X64-NEXT: movq %rcx, 40(%rsp) ; CHECK-X64-NEXT: LBB0_27: ## %entry ; CHECK-X64-NEXT: movq (%rax), %rcx -; CHECK-X64-NEXT: movl (%rsp), %edx +; CHECK-X64-NEXT: movl 8(%rsp), %edx ; CHECK-X64-NEXT: cmpl $48, %edx ; CHECK-X64-NEXT: jae LBB0_29 ; CHECK-X64-NEXT: ## %bb.28: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax +; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rdx, %rax ; CHECK-X64-NEXT: addl $8, %edx -; CHECK-X64-NEXT: movl %edx, (%rsp) +; CHECK-X64-NEXT: movl %edx, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_30 ; CHECK-X64-NEXT: LBB0_29: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rdx ; CHECK-X64-NEXT: addq $8, %rdx -; CHECK-X64-NEXT: movq %rdx, 8(%rsp) +; CHECK-X64-NEXT: movq %rdx, 16(%rsp) ; CHECK-X64-NEXT: LBB0_30: ## %entry ; CHECK-X64-NEXT: movl (%rax), %edx -; CHECK-X64-NEXT: movl 24(%rsp), %eax +; CHECK-X64-NEXT: movl 32(%rsp), %eax ; CHECK-X64-NEXT: cmpl $48, %eax ; CHECK-X64-NEXT: jae LBB0_32 ; CHECK-X64-NEXT: ## %bb.31: ## %entry ; CHECK-X64-NEXT: addl $8, %eax -; CHECK-X64-NEXT: movl %eax, 24(%rsp) +; CHECK-X64-NEXT: movl %eax, 32(%rsp) ; CHECK-X64-NEXT: jmp LBB0_33 ; CHECK-X64-NEXT: LBB0_32: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 32(%rsp) +; CHECK-X64-NEXT: movq %rax, 40(%rsp) ; CHECK-X64-NEXT: LBB0_33: ## %entry -; CHECK-X64-NEXT: movl 4(%rsp), %eax +; CHECK-X64-NEXT: movl 12(%rsp), %eax ; CHECK-X64-NEXT: cmpl $176, %eax ; CHECK-X64-NEXT: jae LBB0_35 ; CHECK-X64-NEXT: ## %bb.34: ## %entry ; CHECK-X64-NEXT: addl $16, %eax -; CHECK-X64-NEXT: movl %eax, 4(%rsp) +; CHECK-X64-NEXT: movl %eax, 12(%rsp) ; CHECK-X64-NEXT: jmp LBB0_36 ; CHECK-X64-NEXT: LBB0_35: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 8(%rsp) +; CHECK-X64-NEXT: movq %rax, 16(%rsp) ; CHECK-X64-NEXT: LBB0_36: ## %entry -; CHECK-X64-NEXT: movl 28(%rsp), %esi +; CHECK-X64-NEXT: movl 36(%rsp), %esi ; CHECK-X64-NEXT: cmpl $176, %esi ; CHECK-X64-NEXT: jae LBB0_38 ; CHECK-X64-NEXT: ## %bb.37: ## %entry -; CHECK-X64-NEXT: movq 40(%rsp), %rax +; 
CHECK-X64-NEXT: movq 48(%rsp), %rax ; CHECK-X64-NEXT: addq %rsi, %rax ; CHECK-X64-NEXT: addl $16, %esi -; CHECK-X64-NEXT: movl %esi, 28(%rsp) +; CHECK-X64-NEXT: movl %esi, 36(%rsp) ; CHECK-X64-NEXT: jmp LBB0_39 ; CHECK-X64-NEXT: LBB0_38: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rsi ; CHECK-X64-NEXT: addq $8, %rsi -; CHECK-X64-NEXT: movq %rsi, 32(%rsp) +; CHECK-X64-NEXT: movq %rsi, 40(%rsp) ; CHECK-X64-NEXT: LBB0_39: ## %entry ; CHECK-X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-X64-NEXT: movl (%rsp), %esi +; CHECK-X64-NEXT: movl 8(%rsp), %esi ; CHECK-X64-NEXT: cmpl $48, %esi ; CHECK-X64-NEXT: jae LBB0_41 ; CHECK-X64-NEXT: ## %bb.40: ## %entry -; CHECK-X64-NEXT: movq 16(%rsp), %rax +; CHECK-X64-NEXT: movq 24(%rsp), %rax ; CHECK-X64-NEXT: addq %rsi, %rax ; CHECK-X64-NEXT: addl $8, %esi -; CHECK-X64-NEXT: movl %esi, (%rsp) +; CHECK-X64-NEXT: movl %esi, 8(%rsp) ; CHECK-X64-NEXT: jmp LBB0_42 ; CHECK-X64-NEXT: LBB0_41: ## %entry -; CHECK-X64-NEXT: movq 8(%rsp), %rax +; CHECK-X64-NEXT: movq 16(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, %rsi ; CHECK-X64-NEXT: addq $8, %rsi -; CHECK-X64-NEXT: movq %rsi, 8(%rsp) +; CHECK-X64-NEXT: movq %rsi, 16(%rsp) ; CHECK-X64-NEXT: LBB0_42: ## %entry ; CHECK-X64-NEXT: movl (%rax), %esi -; CHECK-X64-NEXT: movl 24(%rsp), %eax +; CHECK-X64-NEXT: movl 32(%rsp), %eax ; CHECK-X64-NEXT: cmpl $48, %eax ; CHECK-X64-NEXT: jae LBB0_44 ; CHECK-X64-NEXT: ## %bb.43: ## %entry ; CHECK-X64-NEXT: addl $8, %eax -; CHECK-X64-NEXT: movl %eax, 24(%rsp) +; CHECK-X64-NEXT: movl %eax, 32(%rsp) ; CHECK-X64-NEXT: jmp LBB0_45 ; CHECK-X64-NEXT: LBB0_44: ## %entry -; CHECK-X64-NEXT: movq 32(%rsp), %rax +; CHECK-X64-NEXT: movq 40(%rsp), %rax ; CHECK-X64-NEXT: addq $8, %rax -; CHECK-X64-NEXT: movq %rax, 32(%rsp) +; CHECK-X64-NEXT: movq %rax, 40(%rsp) ; CHECK-X64-NEXT: LBB0_45: ## %entry ; CHECK-X64-NEXT: movabsq $_.str, %rdi ; CHECK-X64-NEXT: movabsq $_printf, %rbx @@ -269,7 +269,7 @@ define void @func(...) nounwind { ; CHECK-X64-NEXT: pushq %r10 ; CHECK-X64-NEXT: pushq %r11 ; CHECK-X64-NEXT: callq *%rbx -; CHECK-X64-NEXT: addq $240, %rsp +; CHECK-X64-NEXT: addq $256, %rsp ## imm = 0x100 ; CHECK-X64-NEXT: popq %rbx ; CHECK-X64-NEXT: retq ; @@ -349,9 +349,9 @@ define void @func(...) 
nounwind { ; CHECK-X32-NEXT: .LBB0_9: # %entry ; CHECK-X32-NEXT: movl (%eax), %r9d ; CHECK-X32-NEXT: movq (%esp), %rax -; CHECK-X32-NEXT: movq 8(%esp), %rcx -; CHECK-X32-NEXT: movq %rcx, 24(%esp) ; CHECK-X32-NEXT: movq %rax, 16(%esp) +; CHECK-X32-NEXT: movq 8(%esp), %rax +; CHECK-X32-NEXT: movq %rax, 24(%esp) ; CHECK-X32-NEXT: movl 4(%esp), %eax ; CHECK-X32-NEXT: cmpl $176, %eax ; CHECK-X32-NEXT: jae .LBB0_11 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 47a6022e428c3..d82f2fb876d2b 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -379,7 +379,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1OR2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1OR2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,5,9,13,13,5,12,13] +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u] ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -388,7 +388,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] ; AVX1OR2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1OR2-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,7,11,15,7,15,6,7] +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -404,7 +404,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,7,11,15,7,15,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -490,7 +490,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,0,4] ; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] @@ -703,26 +703,26 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = 
zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3 -; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13] +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm1[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[48,52,56,60,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm3[0,4,8,12],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm1[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[49,53,57,61,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm3[1,5,9,13],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd %zmm2, %zmm0, %zmm2 ; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4 +; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm1[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[50,54,58,62,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4 -; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm3[2,6,10,14],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 +; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[51,55,59,63,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = 
zmm3[3,7,11,15],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5 -; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpeqb %zmm4, %zmm3, %k0 +; AVX512-NEXT: vpcmpeqb %zmm4, %zmm2, %k0 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm5, %k1 ; AVX512-NEXT: kxnord %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 @@ -812,8 +812,7 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){ ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX2OR512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -821,13 +820,12 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){ ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 -; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX2OR512-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2OR512-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2OR512-NEXT: retq %wide.vec = load <96 x i8>, ptr %ptr %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> @@ -852,12 +850,12 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(ptr %ptr){ ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX-NEXT: retq %wide.vec = load <48 x i8>, ptr %ptr %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> @@ -875,13 +873,13 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(ptr %ptr){ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -900,9 +898,9 @@ define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) ; AVX-NEXT: vmovdqu %xmm1, (%rdi) @@ -944,8 +942,7 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -964,8 +961,7 @@ define void 
@interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512-NEXT: vmovdqu %xmm2, 32(%rdi) ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -1029,8 +1025,7 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1055,8 +1050,7 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1 @@ -1183,13 +1177,13 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX2-LABEL: interleaved_store_vf64_i8_stride3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vpslldq {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] +; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm8 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] -; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 +; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm10 ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: # ymm11 = mem[0,1,0,1] @@ -1201,18 +1195,17 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm3, %ymm11 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25] -; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25] -; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1253,8 +1246,7 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm2[2,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512-NEXT: vpshufb %zmm3, %zmm4, %zmm4 @@ -1404,8 +1396,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -1414,8 +1405,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm9 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] -; AVX2-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm8 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -1437,10 +1427,10 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: retq ; @@ -1454,15 +1444,14 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512-NEXT: vinserti128 $1, 
176(%rdi), %ymm5, %ymm3 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2 @@ -1654,7 +1643,7 @@ define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1691,7 +1680,7 @@ define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1746,9 +1735,9 @@ define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat4_v8f32_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) @@ -1806,9 +1795,9 @@ define void @splat4_v8i32_load_store(ptr %s, ptr %d) nounwind { ; AVX512-LABEL: splat4_v8i32_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11] ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15] ; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) @@ -1838,10 +1827,10 @@ define void @splat4_v4f64_load_store(ptr %s, ptr %d) nounwind { ; ; AVX512-LABEL: splat4_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,5,5,5,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,5,5,5,5] ; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm1 # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,2,2,7,7,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,7,7,7,7] ; AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) ; 
AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) @@ -1871,10 +1860,10 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind { ; ; AVX512-LABEL: splat4_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,0,0,5,5,5,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,5,5,5,5] ; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm1 # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,2,2,7,7,7,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,7,7,7,7] ; AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers.ll index b9ad39806d1ae..8af23fc62958b 100644 --- a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers.ll +++ b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers.ll @@ -1,7 +1,8 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s -; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s -; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=UNKNWOWN +; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s --check-prefix=UNKNWOWN0 +; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=UNKNWOWN-32 +; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s --check-prefix=UNKNWOWN-32-0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; In functions with 'no_caller_saved_registers' attribute, all registers should @@ -16,6 +17,62 @@ define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i ; CHECK: mov{{.*}} %xmm0 ; CHECK: mov{{.*}} {{.*}}, %xmm0 ; CHECK: ret +; UNKNWOWN-LABEL: bar: +; UNKNWOWN: # %bb.0: +; UNKNWOWN-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; UNKNWOWN-NEXT: #APP +; UNKNWOWN-NEXT: #NO_APP +; UNKNWOWN-NEXT: movl $1, %eax +; UNKNWOWN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; UNKNWOWN-NEXT: retq +; +; UNKNWOWN0-LABEL: bar: +; UNKNWOWN0: # %bb.0: +; UNKNWOWN0-NEXT: pushq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; UNKNWOWN0-NEXT: .cfi_offset %xmm0, -32 +; UNKNWOWN0-NEXT: .cfi_offset %rax, -16 +; UNKNWOWN0-NEXT: #APP +; UNKNWOWN0-NEXT: #NO_APP +; UNKNWOWN0-NEXT: movl $1, %eax +; UNKNWOWN0-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; UNKNWOWN0-NEXT: popq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN0-NEXT: retq +; +; UNKNWOWN-32-LABEL: bar: +; UNKNWOWN-32: # %bb.0: +; UNKNWOWN-32-NEXT: subl $28, %esp +; UNKNWOWN-32-NEXT: movups %xmm0, (%esp) # 16-byte Spill +; UNKNWOWN-32-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-32-NEXT: .cfi_offset %xmm0, -32 +; UNKNWOWN-32-NEXT: #APP +; UNKNWOWN-32-NEXT: #NO_APP +; UNKNWOWN-32-NEXT: movl $1, %eax +; UNKNWOWN-32-NEXT: movups (%esp), %xmm0 # 16-byte Reload +; UNKNWOWN-32-NEXT: addl $28, %esp +; UNKNWOWN-32-NEXT: .cfi_def_cfa_offset 4 +; UNKNWOWN-32-NEXT: retl +; +; UNKNWOWN-32-0-LABEL: bar: +; UNKNWOWN-32-0: # %bb.0: +; UNKNWOWN-32-0-NEXT: pushl %eax +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 8 +; 
UNKNWOWN-32-0-NEXT: subl $24, %esp +; UNKNWOWN-32-0-NEXT: movups %xmm0, (%esp) # 16-byte Spill +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-32-0-NEXT: .cfi_offset %eax, -8 +; UNKNWOWN-32-0-NEXT: .cfi_offset %xmm0, -32 +; UNKNWOWN-32-0-NEXT: #APP +; UNKNWOWN-32-0-NEXT: #NO_APP +; UNKNWOWN-32-0-NEXT: movl $1, %eax +; UNKNWOWN-32-0-NEXT: movups (%esp), %xmm0 # 16-byte Reload +; UNKNWOWN-32-0-NEXT: addl $24, %esp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-32-0-NEXT: popl %eax +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 4 +; UNKNWOWN-32-0-NEXT: retl call void asm sideeffect "", "~{xmm0}"() ret i32 1 } @@ -23,6 +80,173 @@ define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i define x86_intrcc void @foo(ptr byval(i8) nocapture readnone %c) { ; CHECK-LABEL: foo ; CHECK-NOT: xmm +; UNKNWOWN-LABEL: foo: +; UNKNWOWN: # %bb.0: # %entry +; UNKNWOWN-NEXT: pushq %rax +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN-NEXT: pushq %r9 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN-NEXT: pushq %r8 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-NEXT: pushq %rdi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN-NEXT: pushq %rsi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN-NEXT: pushq %rdx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN-NEXT: pushq %rcx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN-NEXT: .cfi_offset %rcx, -64 +; UNKNWOWN-NEXT: .cfi_offset %rdx, -56 +; UNKNWOWN-NEXT: .cfi_offset %rsi, -48 +; UNKNWOWN-NEXT: .cfi_offset %rdi, -40 +; UNKNWOWN-NEXT: .cfi_offset %r8, -32 +; UNKNWOWN-NEXT: .cfi_offset %r9, -24 +; UNKNWOWN-NEXT: .cfi_offset %rax, -16 +; UNKNWOWN-NEXT: cld +; UNKNWOWN-NEXT: subq $8, %rsp +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: xorl %edi, %edi +; UNKNWOWN-NEXT: movl $1, %esi +; UNKNWOWN-NEXT: movl $2, %edx +; UNKNWOWN-NEXT: movl $3, %ecx +; UNKNWOWN-NEXT: movl $4, %r8d +; UNKNWOWN-NEXT: movl $5, %r9d +; UNKNWOWN-NEXT: pushq $8 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: pushq $7 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: pushq $6 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: callq bar@PLT +; UNKNWOWN-NEXT: addq $32, %rsp +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset -32 +; UNKNWOWN-NEXT: popq %rcx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN-NEXT: popq %rdx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN-NEXT: popq %rsi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN-NEXT: popq %rdi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-NEXT: popq %r8 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN-NEXT: popq %r9 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN-NEXT: popq %rax +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-NEXT: iretq +; +; UNKNWOWN0-LABEL: foo: +; UNKNWOWN0: # %bb.0: # %entry +; UNKNWOWN0-NEXT: pushq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN0-NEXT: pushq %r9 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN0-NEXT: pushq %r8 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN0-NEXT: pushq %rdi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN0-NEXT: pushq %rsi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN0-NEXT: pushq %rdx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN0-NEXT: pushq %rcx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN0-NEXT: subq $32, %rsp +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 96 +; UNKNWOWN0-NEXT: .cfi_offset %rcx, -64 +; UNKNWOWN0-NEXT: .cfi_offset %rdx, -56 +; UNKNWOWN0-NEXT: .cfi_offset %rsi, -48 +; UNKNWOWN0-NEXT: 
.cfi_offset %rdi, -40 +; UNKNWOWN0-NEXT: .cfi_offset %r8, -32 +; UNKNWOWN0-NEXT: .cfi_offset %r9, -24 +; UNKNWOWN0-NEXT: .cfi_offset %rax, -16 +; UNKNWOWN0-NEXT: cld +; UNKNWOWN0-NEXT: movq %rsp, %rax +; UNKNWOWN0-NEXT: movl $8, 16(%rax) +; UNKNWOWN0-NEXT: movl $7, 8(%rax) +; UNKNWOWN0-NEXT: movl $6, (%rax) +; UNKNWOWN0-NEXT: xorl %edi, %edi +; UNKNWOWN0-NEXT: movl $1, %esi +; UNKNWOWN0-NEXT: movl $2, %edx +; UNKNWOWN0-NEXT: movl $3, %ecx +; UNKNWOWN0-NEXT: movl $4, %r8d +; UNKNWOWN0-NEXT: movl $5, %r9d +; UNKNWOWN0-NEXT: callq bar@PLT +; UNKNWOWN0-NEXT: addq $32, %rsp +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN0-NEXT: popq %rcx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN0-NEXT: popq %rdx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN0-NEXT: popq %rsi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN0-NEXT: popq %rdi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN0-NEXT: popq %r8 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN0-NEXT: popq %r9 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN0-NEXT: popq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN0-NEXT: iretq +; +; UNKNWOWN-32-LABEL: foo: +; UNKNWOWN-32: # %bb.0: # %entry +; UNKNWOWN-32-NEXT: pushl %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-32-NEXT: .cfi_offset %ebp, -8 +; UNKNWOWN-32-NEXT: movl %esp, %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa_register %ebp +; UNKNWOWN-32-NEXT: pushl %eax +; UNKNWOWN-32-NEXT: andl $-16, %esp +; UNKNWOWN-32-NEXT: subl $80, %esp +; UNKNWOWN-32-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; UNKNWOWN-32-NEXT: .cfi_offset %eax, -12 +; UNKNWOWN-32-NEXT: .cfi_offset %xmm0, -32 +; UNKNWOWN-32-NEXT: cld +; UNKNWOWN-32-NEXT: movaps {{.*#+}} xmm0 = [4,5,6,7] +; UNKNWOWN-32-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; UNKNWOWN-32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,3] +; UNKNWOWN-32-NEXT: movups %xmm0, (%esp) +; UNKNWOWN-32-NEXT: movl $8, {{[0-9]+}}(%esp) +; UNKNWOWN-32-NEXT: calll bar@PLT +; UNKNWOWN-32-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; UNKNWOWN-32-NEXT: leal -4(%ebp), %esp +; UNKNWOWN-32-NEXT: popl %eax +; UNKNWOWN-32-NEXT: popl %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa %esp, 4 +; UNKNWOWN-32-NEXT: iretl +; +; UNKNWOWN-32-0-LABEL: foo: +; UNKNWOWN-32-0: # %bb.0: # %entry +; UNKNWOWN-32-0-NEXT: pushl %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-32-0-NEXT: .cfi_offset %ebp, -8 +; UNKNWOWN-32-0-NEXT: movl %esp, %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_register %ebp +; UNKNWOWN-32-0-NEXT: pushl %eax +; UNKNWOWN-32-0-NEXT: andl $-16, %esp +; UNKNWOWN-32-0-NEXT: subl $48, %esp +; UNKNWOWN-32-0-NEXT: .cfi_offset %eax, -12 +; UNKNWOWN-32-0-NEXT: cld +; UNKNWOWN-32-0-NEXT: movl %esp, %eax +; UNKNWOWN-32-0-NEXT: movl $8, 32(%eax) +; UNKNWOWN-32-0-NEXT: movl $7, 28(%eax) +; UNKNWOWN-32-0-NEXT: movl $6, 24(%eax) +; UNKNWOWN-32-0-NEXT: movl $5, 20(%eax) +; UNKNWOWN-32-0-NEXT: movl $4, 16(%eax) +; UNKNWOWN-32-0-NEXT: movl $3, 12(%eax) +; UNKNWOWN-32-0-NEXT: movl $2, 8(%eax) +; UNKNWOWN-32-0-NEXT: movl $1, 4(%eax) +; UNKNWOWN-32-0-NEXT: movl $0, (%eax) +; UNKNWOWN-32-0-NEXT: calll bar@PLT +; UNKNWOWN-32-0-NEXT: leal -4(%ebp), %esp +; UNKNWOWN-32-0-NEXT: popl %eax +; UNKNWOWN-32-0-NEXT: popl %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa %esp, 4 +; UNKNWOWN-32-0-NEXT: iretl entry: tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0 ret void @@ -32,6 +256,173 @@ entry: define x86_intrcc void @baz(ptr byval(i8) nocapture readnone %c) { ; CHECK-LABEL: baz ; CHECK-NOT: xmm +; 
UNKNWOWN-LABEL: baz: +; UNKNWOWN: # %bb.0: # %entry +; UNKNWOWN-NEXT: pushq %rax +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN-NEXT: pushq %r9 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN-NEXT: pushq %r8 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-NEXT: pushq %rdi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN-NEXT: pushq %rsi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN-NEXT: pushq %rdx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN-NEXT: pushq %rcx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN-NEXT: .cfi_offset %rcx, -64 +; UNKNWOWN-NEXT: .cfi_offset %rdx, -56 +; UNKNWOWN-NEXT: .cfi_offset %rsi, -48 +; UNKNWOWN-NEXT: .cfi_offset %rdi, -40 +; UNKNWOWN-NEXT: .cfi_offset %r8, -32 +; UNKNWOWN-NEXT: .cfi_offset %r9, -24 +; UNKNWOWN-NEXT: .cfi_offset %rax, -16 +; UNKNWOWN-NEXT: cld +; UNKNWOWN-NEXT: subq $8, %rsp +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: xorl %edi, %edi +; UNKNWOWN-NEXT: movl $1, %esi +; UNKNWOWN-NEXT: movl $2, %edx +; UNKNWOWN-NEXT: movl $3, %ecx +; UNKNWOWN-NEXT: movl $4, %r8d +; UNKNWOWN-NEXT: movl $5, %r9d +; UNKNWOWN-NEXT: pushq $8 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: pushq $7 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: pushq $6 +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset 8 +; UNKNWOWN-NEXT: callq bar@PLT +; UNKNWOWN-NEXT: addq $32, %rsp +; UNKNWOWN-NEXT: .cfi_adjust_cfa_offset -32 +; UNKNWOWN-NEXT: popq %rcx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN-NEXT: popq %rdx +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN-NEXT: popq %rsi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN-NEXT: popq %rdi +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN-NEXT: popq %r8 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN-NEXT: popq %r9 +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN-NEXT: popq %rax +; UNKNWOWN-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-NEXT: iretq +; +; UNKNWOWN0-LABEL: baz: +; UNKNWOWN0: # %bb.0: # %entry +; UNKNWOWN0-NEXT: pushq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN0-NEXT: pushq %r9 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN0-NEXT: pushq %r8 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN0-NEXT: pushq %rdi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 40 +; UNKNWOWN0-NEXT: pushq %rsi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN0-NEXT: pushq %rdx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN0-NEXT: pushq %rcx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN0-NEXT: subq $32, %rsp +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 96 +; UNKNWOWN0-NEXT: .cfi_offset %rcx, -64 +; UNKNWOWN0-NEXT: .cfi_offset %rdx, -56 +; UNKNWOWN0-NEXT: .cfi_offset %rsi, -48 +; UNKNWOWN0-NEXT: .cfi_offset %rdi, -40 +; UNKNWOWN0-NEXT: .cfi_offset %r8, -32 +; UNKNWOWN0-NEXT: .cfi_offset %r9, -24 +; UNKNWOWN0-NEXT: .cfi_offset %rax, -16 +; UNKNWOWN0-NEXT: cld +; UNKNWOWN0-NEXT: movq %rsp, %rax +; UNKNWOWN0-NEXT: movl $8, 16(%rax) +; UNKNWOWN0-NEXT: movl $7, 8(%rax) +; UNKNWOWN0-NEXT: movl $6, (%rax) +; UNKNWOWN0-NEXT: xorl %edi, %edi +; UNKNWOWN0-NEXT: movl $1, %esi +; UNKNWOWN0-NEXT: movl $2, %edx +; UNKNWOWN0-NEXT: movl $3, %ecx +; UNKNWOWN0-NEXT: movl $4, %r8d +; UNKNWOWN0-NEXT: movl $5, %r9d +; UNKNWOWN0-NEXT: callq bar@PLT +; UNKNWOWN0-NEXT: addq $32, %rsp +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 64 +; UNKNWOWN0-NEXT: popq %rcx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 56 +; UNKNWOWN0-NEXT: popq %rdx +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 48 +; UNKNWOWN0-NEXT: popq %rsi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 40 +; 
UNKNWOWN0-NEXT: popq %rdi +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 32 +; UNKNWOWN0-NEXT: popq %r8 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 24 +; UNKNWOWN0-NEXT: popq %r9 +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 16 +; UNKNWOWN0-NEXT: popq %rax +; UNKNWOWN0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN0-NEXT: iretq +; +; UNKNWOWN-32-LABEL: baz: +; UNKNWOWN-32: # %bb.0: # %entry +; UNKNWOWN-32-NEXT: pushl %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-32-NEXT: .cfi_offset %ebp, -8 +; UNKNWOWN-32-NEXT: movl %esp, %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa_register %ebp +; UNKNWOWN-32-NEXT: pushl %eax +; UNKNWOWN-32-NEXT: andl $-16, %esp +; UNKNWOWN-32-NEXT: subl $80, %esp +; UNKNWOWN-32-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; UNKNWOWN-32-NEXT: .cfi_offset %eax, -12 +; UNKNWOWN-32-NEXT: .cfi_offset %xmm0, -32 +; UNKNWOWN-32-NEXT: cld +; UNKNWOWN-32-NEXT: movaps {{.*#+}} xmm0 = [4,5,6,7] +; UNKNWOWN-32-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; UNKNWOWN-32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,3] +; UNKNWOWN-32-NEXT: movups %xmm0, (%esp) +; UNKNWOWN-32-NEXT: movl $8, {{[0-9]+}}(%esp) +; UNKNWOWN-32-NEXT: calll bar@PLT +; UNKNWOWN-32-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; UNKNWOWN-32-NEXT: leal -4(%ebp), %esp +; UNKNWOWN-32-NEXT: popl %eax +; UNKNWOWN-32-NEXT: popl %ebp +; UNKNWOWN-32-NEXT: .cfi_def_cfa %esp, 4 +; UNKNWOWN-32-NEXT: iretl +; +; UNKNWOWN-32-0-LABEL: baz: +; UNKNWOWN-32-0: # %bb.0: # %entry +; UNKNWOWN-32-0-NEXT: pushl %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_offset 8 +; UNKNWOWN-32-0-NEXT: .cfi_offset %ebp, -8 +; UNKNWOWN-32-0-NEXT: movl %esp, %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa_register %ebp +; UNKNWOWN-32-0-NEXT: pushl %eax +; UNKNWOWN-32-0-NEXT: andl $-16, %esp +; UNKNWOWN-32-0-NEXT: subl $48, %esp +; UNKNWOWN-32-0-NEXT: .cfi_offset %eax, -12 +; UNKNWOWN-32-0-NEXT: cld +; UNKNWOWN-32-0-NEXT: movl %esp, %eax +; UNKNWOWN-32-0-NEXT: movl $8, 32(%eax) +; UNKNWOWN-32-0-NEXT: movl $7, 28(%eax) +; UNKNWOWN-32-0-NEXT: movl $6, 24(%eax) +; UNKNWOWN-32-0-NEXT: movl $5, 20(%eax) +; UNKNWOWN-32-0-NEXT: movl $4, 16(%eax) +; UNKNWOWN-32-0-NEXT: movl $3, 12(%eax) +; UNKNWOWN-32-0-NEXT: movl $2, 8(%eax) +; UNKNWOWN-32-0-NEXT: movl $1, 4(%eax) +; UNKNWOWN-32-0-NEXT: movl $0, (%eax) +; UNKNWOWN-32-0-NEXT: calll bar@PLT +; UNKNWOWN-32-0-NEXT: leal -4(%ebp), %esp +; UNKNWOWN-32-0-NEXT: popl %eax +; UNKNWOWN-32-0-NEXT: popl %ebp +; UNKNWOWN-32-0-NEXT: .cfi_def_cfa %esp, 4 +; UNKNWOWN-32-0-NEXT: iretl entry: tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) ret void diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..731cd7dd0a0bb 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ 
-1028,8 +1028,8 @@ define void @infiniteloop3() { ; ENABLE-NEXT: LBB12_4: ## %loop1 ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: movq %rcx, %rdx -; ENABLE-NEXT: testq %rax, %rax ; ENABLE-NEXT: movq (%rax), %rcx +; ENABLE-NEXT: testq %rax, %rax ; ENABLE-NEXT: jne LBB12_3 ; ENABLE-NEXT: ## %bb.5: ## in Loop: Header=BB12_4 Depth=1 ; ENABLE-NEXT: movq %rdx, %rax @@ -1058,8 +1058,8 @@ define void @infiniteloop3() { ; DISABLE-NEXT: LBB12_4: ## %loop1 ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: movq %rcx, %rdx -; DISABLE-NEXT: testq %rax, %rax ; DISABLE-NEXT: movq (%rax), %rcx +; DISABLE-NEXT: testq %rax, %rax ; DISABLE-NEXT: jne LBB12_3 ; DISABLE-NEXT: ## %bb.5: ## in Loop: Header=BB12_4 Depth=1 ; DISABLE-NEXT: movq %rdx, %rax diff --git a/llvm/test/CodeGen/X86/xaluo128.ll b/llvm/test/CodeGen/X86/xaluo128.ll index 977df0f16bb28..b992bb7b377d1 100644 --- a/llvm/test/CodeGen/X86/xaluo128.ll +++ b/llvm/test/CodeGen/X86/xaluo128.ll @@ -18,19 +18,19 @@ define zeroext i1 @saddoi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seto %al -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %edx, 4(%ebx) +; X86-NEXT: movl %esi, 8(%ebx) +; X86-NEXT: movl %edi, 12(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -58,19 +58,19 @@ define zeroext i1 @uaddoi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: setb %al -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %edx, 4(%ebx) +; X86-NEXT: movl %esi, 8(%ebx) +; X86-NEXT: movl %edi, 12(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -99,19 +99,19 @@ define zeroext i1 @ssuboi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: 
sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: seto %al -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %edx, 4(%ebx) +; X86-NEXT: movl %esi, 8(%ebx) +; X86-NEXT: movl %edi, 12(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -139,19 +139,19 @@ define zeroext i1 @usuboi128(i128 %v1, i128 %v2, ptr %res) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: setb %al -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, (%ebx) +; X86-NEXT: movl %edx, 4(%ebx) +; X86-NEXT: movl %esi, 8(%ebx) +; X86-NEXT: movl %edi, 12(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index a076d0d762aa3..255e5631edc6f 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -129,11 +129,11 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, ptr %res) { ; ; WIN32-LABEL: smuloi16: ; WIN32: # %bb.0: -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %dx +; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %cx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movw %dx, (%ecx) +; WIN32-NEXT: movw %cx, (%edx) ; WIN32-NEXT: retl %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 @@ -168,10 +168,10 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, ptr %res) { ; WIN32-LABEL: smuloi32: ; WIN32: # %bb.0: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movl %edx, (%ecx) +; WIN32-NEXT: movl %ecx, (%edx) ; WIN32-NEXT: retl %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -209,54 +209,54 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $8, %esp +; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebp, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %ebp, %eax ; 
WIN32-NEXT: mull %ebx ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: addl %ebp, %edi ; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: adcl %eax, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %edi, 4(%eax) +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -454,23 +454,23 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: testl %edi, %edi ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: addl %eax, %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %esi, %edx @@ -557,35 +557,34 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %edi, %esi ; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill 
-; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %ebp, %edi -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; WIN32-NEXT: adcl %ebp, %ecx ; WIN32-NEXT: movl %ecx, %ebp ; WIN32-NEXT: sarl $31, %ebp ; WIN32-NEXT: addl %ebx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; WIN32-NEXT: adcl %esi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: imull %ebx ; WIN32-NEXT: addl %ecx, %eax @@ -678,38 +677,38 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: testl %ebp, %ebp ; WIN32-NEXT: setne %al -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: testl %ecx, %ecx ; WIN32-NEXT: setne %bl ; WIN32-NEXT: andb %al, %bl -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: mull %edi ; WIN32-NEXT: seto %bh ; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload ; WIN32-NEXT: orb %bl, %bh -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: addl %eax, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %al ; WIN32-NEXT: orb %bh, %al ; WIN32-NEXT: testb %al, %al ; WIN32-NEXT: jne LBB14_2 ; WIN32-NEXT: # %bb.1: -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: LBB14_2: -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %ecx, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -957,43 +956,41 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %ecx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ebx, %ebp +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl %ebp, %ebx ; WIN32-NEXT: adcl %ecx, 
%edi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: imull %esi, %ecx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ebp, %esi -; WIN32-NEXT: adcl %ecx, %ebx -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: adcl %ecx, %ebp +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: addl %edi, %ebp +; WIN32-NEXT: adcl %eax, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: xorl %esi, %edx ; WIN32-NEXT: xorl %eax, %esi @@ -1002,7 +999,6 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: # %bb.3: # %continue ; WIN32-NEXT: movb $1, %al ; WIN32-NEXT: LBB18_2: # %overflow -; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1266,28 +1262,28 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: testl %edi, %edi ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: setne %bl +; WIN32-NEXT: andb %cl, %bl ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: seto %bh +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: seto %ch -; WIN32-NEXT: orb %bl, %ch -; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: seto %cl +; WIN32-NEXT: orb %bh, %cl +; WIN32-NEXT: orb %bl, %cl +; WIN32-NEXT: addl %eax, %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %ch, %al +; WIN32-NEXT: orb %cl, %al ; WIN32-NEXT: subb $1, %al ; WIN32-NEXT: je LBB22_1 ; WIN32-NEXT: # %bb.3: # %continue @@ -1476,12 +1472,12 @@ define zeroext i1 @smuloi16_load(ptr %ptr1, i16 %v2, ptr %res) { ; ; WIN32-LABEL: smuloi16_load: ; WIN32: # %bb.0: -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movzwl (%eax), %edx -; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %dx +; WIN32-NEXT: movzwl (%eax), %ecx +; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %cx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movw %dx, (%ecx) +; WIN32-NEXT: movw %cx, (%edx) ; WIN32-NEXT: retl %v1 = load i16, ptr %ptr1 %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2) @@ -1516,12 +1512,12 @@ define zeroext i1 @smuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) { ; ; WIN32-LABEL: smuloi16_load2: ; WIN32: # %bb.0: -; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imulw (%eax), %dx +; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imulw (%eax), %cx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movw %dx, (%ecx) +; WIN32-NEXT: movw %cx, (%edx) ; WIN32-NEXT: retl %v2 = load i16, ptr %ptr2 %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2) @@ -1556,12 +1552,12 @@ define zeroext i1 @smuloi32_load(ptr %ptr1, i32 %v2, ptr %res) { ; ; WIN32-LABEL: smuloi32_load: ; WIN32: # %bb.0: -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl (%eax), %ecx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movl %edx, (%ecx) +; WIN32-NEXT: movl %ecx, (%edx) ; WIN32-NEXT: retl %v1 = load i32, ptr %ptr1 %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -1596,12 +1592,12 @@ define zeroext i1 @smuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) { ; ; WIN32-LABEL: smuloi32_load2: ; WIN32: # %bb.0: -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull (%eax), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull (%eax), %edx ; WIN32-NEXT: seto %al -; WIN32-NEXT: movl %edx, (%ecx) +; WIN32-NEXT: movl %ecx, (%edx) ; WIN32-NEXT: retl %v2 = load i32, ptr %ptr2 %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -1640,57 +1636,56 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $12, %esp +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ecx +; WIN32-NEXT: movl (%eax), %edi ; WIN32-NEXT: movl 4(%eax), %ebp ; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %ebp, (%esp) # 4-byte Spill ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: imull %ebx, %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ebx ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %edi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: addl %ecx, %ebp ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: addl %ebp, %edi ; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: 
addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; WIN32-NEXT: adcl %eax, %esi +; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload ; WIN32-NEXT: imull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) +; WIN32-NEXT: movl %edi, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1733,57 +1728,57 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl (%ecx), %ebx -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl %ebp, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ebx ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: adcl %esi, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl 4(%eax), %ecx ; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: sarl $31, %edi ; WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: addl %ebp, %edi ; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: adcl %eax, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull (%esp) # 4-byte Folded Reload ; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) +; WIN32-NEXT: movl %edi, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl 
%esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2138,27 +2133,27 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebp -; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: movl 4(%esi), %eax +; WIN32-NEXT: testl %ebp, %ebp ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl (%esi), %esi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: addl %edi, %edx ; WIN32-NEXT: setb %cl ; WIN32-NEXT: orb %ch, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -2218,21 +2213,21 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl (%ecx), %ebp -; WIN32-NEXT: movl 4(%ecx), %esi +; WIN32-NEXT: movl 4(%ecx), %edi ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: testl %edi, %edi ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: addl %eax, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %esi, %edx diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll index d50752e48d293..0f065df7c0a74 100644 --- a/llvm/test/CodeGen/X86/xor-lea.ll +++ b/llvm/test/CodeGen/X86/xor-lea.ll @@ -180,8 +180,8 @@ define i64 @xor_add_sminval_i64(i64 %x, i64 %y) { ; X86-LABEL: xor_add_sminval_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: retl @@ -261,8 +261,8 @@ define i64 @add_xor_sminval_i64(i64 %x, i64 %y) { ; X86-LABEL: add_xor_sminval_i64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..c15cf95fda9d0 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -131,8 +131,8 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: notl %edx ; X86-NEXT: andl %ecx, %edx ; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: testw %dx, %dx ; 
X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -149,8 +149,8 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: notl %ecx ; X64-LIN-NEXT: andl %esi, %ecx ; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx ; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: testw %cx, %cx ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -168,8 +168,8 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: notl %ecx ; X64-WIN-NEXT: andl %edx, %ecx ; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx ; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: testw %cx, %cx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll b/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll index f56044b857b93..78d46e84cd506 100644 --- a/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll +++ b/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll @@ -22,9 +22,9 @@ define dso_local zeroext i1 @test1(ptr nocapture noundef readonly %0) local_unna ; I386-NEXT: movl (%eax), %eax ; I386-NEXT: movzbl (%eax), %ebx ; I386-NEXT: calll bar +; I386-NEXT: # implicit-def: $cl +; I386-NEXT: # kill: killed $cl ; I386-NEXT: testb %al, %al -; I386-NEXT: # implicit-def: $al -; I386-NEXT: # kill: killed $al ; I386-NEXT: je .LBB0_6 ; I386-NEXT: # %bb.1: ; I386-NEXT: cmpl $0, mas_data_end_type diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index ddd7f10168936..7246801ce979a 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1682,33 +1682,19 @@ define void @vec256_v32i8_to_v4i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE2-LABEL: vec256_v32i8_to_v2i128_factor16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec256_v32i8_to_v2i128_factor16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] -; SSE42-NEXT: pand %xmm0, %xmm1 -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: retq +; SSE-LABEL: vec256_v32i8_to_v2i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v2i128_factor16: ; AVX: # %bb.0: @@ -1797,7 +1783,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1808,7 +1794,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1819,7 +1805,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2079,7 +2065,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -2340,7 +2326,7 @@ define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -2866,7 +2852,7 @@ define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2,0,3,3,0,4,4,0,5] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2,u,3,3,u,4,4,u,5] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero @@ -3103,8 +3089,7 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5] ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero @@ -3367,43 +3352,24 @@ define void @vec384_v48i8_to_v4i96_factor12(ptr %in.vec.base.ptr, ptr %in.vec.bi } define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE2-LABEL: vec384_v48i8_to_v3i128_factor16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb 32(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec384_v48i8_to_v3i128_factor16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] -; SSE42-NEXT: pand %xmm0, %xmm1 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: retq +; SSE-LABEL: vec384_v48i8_to_v3i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa 
%xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v3i128_factor16: ; AVX: # %bb.0: @@ -3518,9 +3484,9 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) @@ -3533,9 +3499,9 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm0, (%rcx) @@ -3547,9 +3513,9 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) @@ -3614,8 +3580,8 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) @@ -3639,7 +3605,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3652,7 +3618,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -3665,7 +3631,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; 
AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3854,7 +3820,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3871,7 +3837,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3886,7 +3852,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3901,7 +3867,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3918,7 +3884,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,0,2,2] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3933,7 +3899,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3949,7 +3915,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,17,4,5,18,7,8,19,10,11,20,13,14,21] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -4247,7 +4213,7 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -4263,7 +4229,7 @@ define void @vec384_v24i16_to_v4i96_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,17,7,8,9,10,11,18,13,14,15] ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4343,14 +4309,14 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) @@ -4360,13 +4326,13 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) @@ -4376,13 +4342,13 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) @@ -4428,7 +4394,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -4444,7 +4410,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15] ; 
AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4477,9 +4443,9 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) @@ -4494,9 +4460,9 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) @@ -4510,11 +4476,11 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -4551,7 +4517,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,17,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -4579,8 +4545,8 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movaps 16(%rdx), %xmm1 -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) @@ -4593,8 +4559,8 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. 
; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) @@ -4878,7 +4844,7 @@ define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5009,13 +4975,13 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) @@ -5028,7 +4994,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5043,7 +5009,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5107,9 +5073,9 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,0],xmm1[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE2-NEXT: movaps 32(%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movaps 32(%rdx), %xmm0 ; SSE2-NEXT: movaps %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) @@ -5124,9 +5090,9 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: movaps 32(%rdx), %xmm1 ; SSE42-NEXT: movaps %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) @@ -5139,9 +5105,9 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1],zero -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) @@ -5182,11 +5148,11 @@ define void @vec384_v12i32_to_v2i192_factor6(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5241,8 +5207,8 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps 16(%rdx), %xmm0 -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) @@ -5255,8 +5221,8 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec. 
; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: movaps 16(%rdx), %xmm0 -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) @@ -5331,8 +5297,8 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE-LABEL: vec384_v6i64_to_v3i128_factor2: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: paddb 16(%rsi), %xmm1 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero @@ -5386,7 +5352,7 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,1,11,2,13,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5432,9 +5398,9 @@ define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) @@ -5447,9 +5413,9 @@ define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm0, (%rcx) @@ -5461,13 +5427,13 @@ define void @vec384_v6i64_to_v2i192_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5531,8 +5497,8 @@ define void @vec384_v6i64_to_v1i384_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) @@ -5605,8 +5571,8 @@ define void @vec384_v3i128_to_v1i384_factor3(ptr 
%in.vec.base.ptr, ptr %in.vec.b ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) @@ -5696,8 +5662,8 @@ define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE42-NEXT: pxor %xmm3, %xmm3 @@ -5716,23 +5682,23 @@ define void @vec512_v64i8_to_v32i16_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: @@ -6011,51 +5977,28 @@ define void @vec512_v64i8_to_v8i64_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bia } define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} 
xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb 48(%rdx), %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] -; SSE42-NEXT: pand %xmm0, %xmm1 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: retq +; SSE-LABEL: vec512_v64i8_to_v4i128_factor16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: paddb 16(%rdx), %xmm0 +; SSE-NEXT: paddb 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v4i128_factor16: ; AVX: # %bb.0: @@ -6083,9 +6026,9 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: 
vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero @@ -6103,9 +6046,9 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -6122,9 +6065,9 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -6162,7 +6105,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,u,1,u,8,u,9,u] ; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -6198,7 +6141,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} 
zmm2 = [0,u,1,u,8,u,9,u] ; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -6218,41 +6161,23 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b } define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { -; SSE2-LABEL: vec512_v64i8_to_v2i256_factor32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movaps 16(%rdx), %xmm2 -; SSE2-NEXT: movaps 48(%rdx), %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movaps %xmm3, 48(%rcx) -; SSE2-NEXT: movaps %xmm2, 16(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec512_v64i8_to_v2i256_factor32: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = [255,0] -; SSE42-NEXT: pand %xmm0, %xmm1 -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movaps 16(%rdx), %xmm2 -; SSE42-NEXT: movaps 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm3, 48(%rcx) -; SSE42-NEXT: movaps %xmm2, 16(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: retq +; SSE-LABEL: vec512_v64i8_to_v2i256_factor32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: paddb 32(%rdx), %xmm0 +; SSE-NEXT: paddb (%rdx), %xmm1 +; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v2i256_factor32: ; AVX: # %bb.0: @@ -6275,7 +6200,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6289,7 +6214,7 @@ define void 
@vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6303,7 +6228,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -6355,7 +6280,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6368,7 +6293,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -6381,7 +6306,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6424,8 +6349,8 @@ define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -6444,23 +6369,23 @@ define void @vec512_v32i16_to_v16i32_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b ; ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: @@ -6789,7 +6714,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -7036,8 +6961,8 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero @@ -7056,23 +6981,23 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) 
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: @@ -7230,10 +7155,10 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,u,u,3,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7266,7 +7191,7 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15] ; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -7563,19 +7488,19 @@ define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7634,7 +7559,7 @@ define void @vec512_v8i64_to_v4i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z} ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,5,3,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,5,3,7] ; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -7989,9 +7914,9 @@ define void @vec512_v2i256_to_v1i512_factor2(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) 
; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 7fbb211b69ccf..b4a9c8626d161 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -659,18 +659,18 @@ define void @vec128_i8_widen_to_i64_factor8_broadcast_to_v2i64_factor2(ptr %in.v define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: @@ -749,7 +749,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 @@ -761,7 +761,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax @@ -803,24 +803,24 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; SSE42-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa 16(%rdi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq @@ -868,7 +868,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -881,7 +881,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -919,24 +919,24 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; SSE42-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 16(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa 16(%rdi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq @@ -968,7 +968,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -978,7 +978,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -989,7 +989,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -1000,7 +1000,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -1070,20 +1070,20 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq @@ -1157,21 +1157,21 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -1281,21 +1281,21 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; SSE2-LABEL: 
vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -1404,22 +1404,22 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,7 +1448,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -1464,7 +1464,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = 
[18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1555,18 +1555,18 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa 32(%rdi), %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa %xmm4, (%rcx) @@ -1636,7 +1636,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1659,56 +1659,56 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa 32(%rdi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; 
AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1754,7 +1754,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1776,22 +1776,22 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1828,12 +1828,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -1841,12 +1841,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -1854,12 +1854,12 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -1869,7 +1869,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1911,29 +1911,29 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa 32(%rdi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] @@ -1978,7 +1978,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2004,7 +2004,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2026,7 +2026,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2100,12 +2100,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX2-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -2113,12 +2113,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2127,10 +2127,10 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2139,12 +2139,12 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -2153,10 +2153,10 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,3,8,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -2178,7 +2178,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2278,12 +2278,12 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2304,12 +2304,12 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,5,0,7] +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2330,7 +2330,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2398,21 +2398,21 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: @@ -2500,8 +2500,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; SSE2-LABEL: vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -2572,8 +2572,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb 
{{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -2652,8 +2651,8 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; SSE2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -2672,19 +2671,19 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; SSE42-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero +; SSE42-NEXT: pblendvb %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq @@ -2728,17 +2727,17 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = mem & (ymm1 | ymm0) +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2747,17 +2746,17 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 -; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm3 +; AVX512DQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem & (ymm1 | ymm0) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero,xmm2[0],zero,zero,zero +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -2864,8 +2863,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -2944,8 +2942,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; SSE2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -2964,19 +2962,19 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; ; SSE42-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; SSE42: # 
%bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pblendvb %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq @@ -2988,7 +2986,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709551360,18446744073709551360] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3020,17 +3019,17 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = mem & (ymm1 | ymm0) +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: 
vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3039,17 +3038,17 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 -; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm3 +; AVX512DQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem & (ymm1 | ymm0) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -3109,21 +3108,21 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; ; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: movdqa %xmm1, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: movdqa %xmm2, %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; @@ -3134,7 +3133,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpmovsxwd {{.*#+}} xmm3 = [4294967040,4294967295,4294967295,4294967040] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -3155,8 +3154,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3254,19 +3252,19 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; ; SSE42-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm1 -; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rdi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm2 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE42-NEXT: movdqa %xmm1, %xmm3 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 -; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: pblendvb %xmm0, %xmm1, %xmm3 +; SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq @@ -3277,7 +3275,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 @@ -3295,10 +3293,10 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm3 = [255,0,255,0] +; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3391,9 +3389,9 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) @@ -3409,9 +3407,9 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movaps 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: movaps 32(%rdx), %xmm0 ; SSE42-NEXT: movaps %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: movdqa %xmm3, (%rcx) @@ -3419,19 +3417,19 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615] -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3442,10 +3440,10 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 -; 
AVX2-NEXT: vpmovsxwq {{.*#+}} ymm3 = [255,0,18446744073709551615,18446744073709551360] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3634,7 +3632,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -3651,7 +3649,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3696,147 +3694,147 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; ; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; SSE42-NEXT: movdqa 48(%rdi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX2-NEXT: 
vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, 
%ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: 
vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -3854,7 +3852,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3877,8 +3875,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; SSE2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -3988,8 +3986,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4025,8 +4023,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4062,7 +4060,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4080,7 +4078,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4126,23 +4124,23 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: movdqa 48(%rdi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4168,106 +4166,106 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; 
AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa 
(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: 
vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4285,7 +4283,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4328,8 +4326,8 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pxor %xmm2, %xmm2 @@ -4418,7 +4416,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 @@ -4456,9 +4454,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm2 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm1 ; SSE2-NEXT: movaps %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) @@ -4474,9 +4472,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movaps 32(%rdx), %xmm0 ; SSE42-NEXT: movaps %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) @@ -4490,10 +4488,10 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) @@ -4557,7 +4555,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 @@ -4645,21 +4643,21 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4707,7 +4705,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4725,7 +4723,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4739,7 +4737,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4756,7 +4754,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] @@ -4782,21 +4780,21 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1] +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,1,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] -; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; @@ -4823,22 +4821,22 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,1,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4848,7 +4846,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4865,11 +4863,11 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4884,11 +4882,11 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4908,7 +4906,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4926,7 +4924,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4940,7 +4938,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4957,7 +4955,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] @@ -4984,8 +4982,8 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: xorps %xmm2, %xmm2 @@ -5002,8 +5000,8 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pxor %xmm2, %xmm2 @@ -5062,7 +5060,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5080,7 +5078,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5095,7 +5093,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5125,9 +5123,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) @@ -5143,9 +5141,9 @@ define void 
@vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) @@ -5159,11 +5157,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) @@ -5177,7 +5175,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -5195,7 +5193,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5212,7 +5210,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5226,7 +5224,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -5324,7 +5322,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5342,7 +5340,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5357,7 +5355,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5367,7 +5365,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5400,9 +5398,9 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) @@ -5416,9 +5414,9 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) @@ -5431,10 +5429,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) @@ -5443,11 +5441,11 
@@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX2-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,0] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5466,7 +5464,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,10,0,u,u,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5483,7 +5481,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,10,0,u,u,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5495,7 +5493,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,10,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 @@ -5508,7 +5506,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,0] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -5997,7 +5995,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6010,7 +6008,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512F-NEXT: 
vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6023,7 +6021,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6596,7 +6594,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6611,7 +6609,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6626,7 +6624,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6720,7 +6718,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6735,7 +6733,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6751,7 +6749,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6837,7 +6835,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6852,7 +6850,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -6868,7 +6866,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,17,18,19,4,21,22,23,0,25,26,27,4,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,18,19,4,21,22,23,0,25,26,27,4,29,30,31] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: 
vmovdqa64 %zmm0, (%rcx) @@ -6943,7 +6941,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6958,7 +6956,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6973,7 +6971,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -7039,7 +7037,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -7054,7 +7052,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -7082,7 +7080,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,0,13,2,15] ; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -7146,7 +7144,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7161,7 +7159,7 @@ define void 
@vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 45d589b6c988e..559fcebbc5e74 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -643,7 +643,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -652,7 +652,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] @@ -737,7 +737,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -747,7 +747,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -968,8 +968,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -983,8 +983,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 @@ -997,8 +997,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -1064,8 +1064,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -1079,8 +1079,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshufb %xmm2, %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 @@ -1093,8 +1093,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ 
-1159,8 +1159,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; SSE2-NEXT: pandn (%rdi), %xmm1 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm2 @@ -1184,7 +1184,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -1196,7 +1196,7 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1328,7 +1328,7 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1354,8 +1354,8 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -1415,7 +1415,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1440,8 +1440,8 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; SSE2-NEXT: pandn (%rdi), %xmm1 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm2 @@ -1501,7 +1501,7 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1524,9 +1524,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, (%rdx) @@ -1662,7 +1662,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1680,7 +1680,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1698,7 +1698,7 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] ; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1772,7 +1772,7 @@ define void 
@vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512F-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1790,7 +1790,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512DQ-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) @@ -1808,7 +1808,7 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,0,7] ; AVX512BW-FAST-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -1830,22 +1830,23 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn 48(%rdi), %xmm1 -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn 48(%rdi), %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb (%rsi), %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm3 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) +; SSE2-NEXT: movdqa %xmm3, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: @@ -1930,12 +1931,12 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -2015,8 +2016,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -2265,8 +2265,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -2369,7 +2368,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2496,7 +2496,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967040,4294967295,4294967295,4294967040] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -2515,8 +2515,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -2618,7 +2617,7 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 @@ -2651,8 +2650,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -2669,8 +2668,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -2718,9 +2717,9 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) @@ -2733,9 +2732,9 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa %xmm1, %xmm2 ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movaps 32(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm2 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movaps 32(%rsi), %xmm0 ; SSE42-NEXT: movaps %xmm0, 32(%rdx) ; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) @@ -2744,12 +2743,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) @@ -2760,10 +2759,10 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [255,0,18446744073709551615,18446744073709551360] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551615,18446744073709551360,18446744073709551615] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2775,7 +2774,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq {{.*#+}} xmm1 = mem ^ (xmm1 & (xmm0 ^ mem)) ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2790,7 +2789,7 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-LABEL: 
vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551360,18446744073709551615] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = mem ^ (xmm1 & (xmm0 ^ mem)) ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -2807,8 +2806,8 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX512BW-NEXT: movw $1, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ; AVX512BW-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -2881,49 +2880,49 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; 
AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,32,59,32,61,32,63,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,32,59,32,61,32,63,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3077,7 +3076,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,32,60,61,32,63,8,32,10,11,32,13,14,32,16,17,32,19,20,32,22,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,32,60,61,32,63,8,32,10,11,32,13,14,32,16,17,32,19,20,32,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3147,16 +3146,16 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3192,16 +3191,16 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -3222,16 +3221,16 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -3252,7 +3251,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,32,61,62,63,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,32,61,62,63,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3411,7 +3410,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3526,7 +3525,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3556,9 +3555,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) @@ -3572,9 +3571,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movaps 32(%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movaps 32(%rsi), %xmm0 ; SSE42-NEXT: movaps %xmm0, 32(%rdx) ; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) @@ -3585,10 +3584,10 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) @@ -3614,10 +3613,10 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm2[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rdx) @@ -3629,10 +3628,10 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm2[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rdx) @@ -3642,7 +3641,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3716,17 +3715,17 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3764,7 +3763,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -3777,7 +3776,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -3790,7 +3789,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3831,9 +3830,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 @@ -3851,9 +3850,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX-NEXT: vbroadcastss (%rdi), %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vbroadcastss (%rdi), %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) @@ -3867,7 +3866,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3884,7 +3883,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3901,7 +3900,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] @@ -3914,7 +3913,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -3927,7 +3926,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -3940,7 +3939,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4023,7 +4022,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -4036,7 +4035,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -4049,7 +4048,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4076,9 +4075,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: movaps 32(%rsi), %xmm0 ; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: movaps 32(%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm0, 32(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) @@ -4092,9 +4091,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] -; SSE42-NEXT: movaps 32(%rsi), %xmm2 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movaps 32(%rsi), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) ; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) @@ -4105,11 +4104,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vbroadcastss (%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3] ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: 
vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) @@ -4120,7 +4119,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,6,7] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] @@ -4133,7 +4132,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4145,7 +4144,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4157,7 +4156,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,8,9,10,11,0,0,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,8,9,10,11,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4237,7 +4236,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,8,3,8,5,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -4250,7 +4249,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,8,3,8,5,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -4263,7 +4262,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,8,3,8,5,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4288,9 +4287,9 @@ define 
void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: movaps %xmm2, 32(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) @@ -4302,9 +4301,9 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE42-NEXT: movaps 32(%rsi), %xmm2 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movaps 32(%rsi), %xmm2 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) ; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) @@ -4314,10 +4313,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) @@ -4340,7 +4339,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,0,0,0,0] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,2,8,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4352,7 +4351,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,0,0,0,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,2,8,u,u,u,u] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4364,7 +4363,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,4,5,0,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,15,2,8,4,5,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4796,7 +4795,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = [255,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4807,7 +4806,7 @@ define void 
@vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = [255,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4818,7 +4817,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm0 = [255,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4829,7 +4828,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512BW-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,0,2,0,8,0,6,0] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,u,2,u,8,u,6,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 @@ -5234,7 +5233,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5320,7 +5319,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5333,7 +5332,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5346,7 +5345,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,16,3,16,5,16,7,16,9,16,11,16,13,16,15] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5411,7 +5410,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5424,7 +5423,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5437,7 +5436,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5494,7 +5493,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5507,7 +5506,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5520,7 +5519,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512BW-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5585,7 +5584,7 @@ define void 
@vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5598,7 +5597,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5611,7 +5610,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,8,3,8,5,8,7] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5668,7 +5667,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5681,7 +5680,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5694,7 +5693,7 @@ define void @vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512BW-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -5729,16 +5728,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: movq %rax, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: movl %eax, %ebx -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: movq %rcx, %r9 +; AVX-NEXT: movq %rcx, %r10 +; AVX-NEXT: movl %ecx, %r11d +; AVX-NEXT: movl %ecx, %ebx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: 
vpinsrb $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: shrl $16, %ebx ; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX-NEXT: shrl $24, %r11d @@ -5749,30 +5748,30 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 8(%rdi), %rax -; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $24, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $40, %rcx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $48, %rcx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq 8(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $16, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $24, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $40, %rax +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $48, %rax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: shrq $56, %rcx +; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 @@ -5841,7 +5840,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5854,7 +5853,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 @@ -5867,7 +5866,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,9,2,3,8,9,6,7] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 From 218da1b00581514e9b0ea0a2aae54d5ec3582ebf Mon Sep 17 
00:00:00 2001
From: Rose
Date: Mon, 2 Jun 2025 11:48:15 -0400
Subject: [PATCH 2/2] Autogenerate checks in x86-cmov-converter.mir

---
 .../MIR/InstrRef/x86-cmov-converter.mir | 82 +++++++++++++++++--
 1 file changed, 74 insertions(+), 8 deletions(-)

diff --git a/llvm/test/DebugInfo/MIR/InstrRef/x86-cmov-converter.mir b/llvm/test/DebugInfo/MIR/InstrRef/x86-cmov-converter.mir
index 0749964292cd6..8dd991701a512 100644
--- a/llvm/test/DebugInfo/MIR/InstrRef/x86-cmov-converter.mir
+++ b/llvm/test/DebugInfo/MIR/InstrRef/x86-cmov-converter.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc %s -o - --run-pass=x86-cmov-conversion -mtriple=x86_64-- | FileCheck %s
 # REQUIRES: x86-registered-target
 #
@@ -6,14 +7,6 @@
 # locations. Check that this still works when unfolding memory operands, which
 # involves more decomposition of instructions.
 #
-# CHECK-LABEL: name: CmovInHotPath
-# CHECK-LABEL: bb.3.for.body:
-# CHECK: CMOV32rr {{.*}}, debug-instr-number 1
-#
-# CHECK-LABEL: name: test_cmov_memoperand_in_group_reuse_for_addr2
-# CHECK-LABEL: bb.2.entry:
-# CHECK-NEXT: PHI {{.*}} debug-instr-number 1,
-# CHECK-NEXT: PHI {{.*}} debug-instr-number 2,
 --- |
   ; ModuleID = 'x86-cmov-converter.ll'
   source_filename = "x86-cmov-converter.ll"
@@ -110,6 +103,55 @@ liveins:
   - { reg: '$rcx', virtual-reg: '%8' }
 machineFunctionInfo: {}
 body: |
+  ; CHECK-LABEL: name: CmovInHotPath
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT: successors: %bb.1(0x50000000), %bb.2(0x30000000)
+  ; CHECK-NEXT: liveins: $edi, $esi, $edx, $rcx
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rcx
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edx
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $esi
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr32 = COPY $edi
+  ; CHECK-NEXT: TEST32rr [[COPY3]], [[COPY3]], implicit-def $eflags, debug-location !12
+  ; CHECK-NEXT: JCC_1 %bb.2, 14, implicit $eflags
+  ; CHECK-NEXT: JMP_1 %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1.for.body.preheader:
+  ; CHECK-NEXT: successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[MOV32rr:%[0-9]+]]:gr32 = MOV32rr [[COPY3]], debug-location !13
+  ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[MOV32rr]], %subreg.sub_32bit, debug-location !13
+  ; CHECK-NEXT: JMP_1 %bb.3
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.for.cond.cleanup:
+  ; CHECK-NEXT: RET 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3.for.body:
+  ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64 = PHI [[COPY]], %bb.1, %4, %bb.5
+  ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gr64 = PHI [[SUBREG_TO_REG]], %bb.1, %3, %bb.5
+  ; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[PHI]], 1, $noreg, 0, $noreg, debug-location !13 :: (load (s32) from %ir.lsr.iv1)
+  ; CHECK-NEXT: [[INC32r:%[0-9]+]]:gr32 = nsw INC32r [[MOV32rm]], implicit-def dead $eflags, debug-location !13
+  ; CHECK-NEXT: [[IMUL32rr:%[0-9]+]]:gr32 = nsw IMUL32rr [[MOV32rm]], [[COPY2]], implicit-def dead $eflags, debug-location !13
+  ; CHECK-NEXT: [[SUB32rr:%[0-9]+]]:gr32 = SUB32rr [[IMUL32rr]], [[COPY1]], implicit-def $eflags, debug-location !13
+  ; CHECK-NEXT: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 10
+  ; CHECK-NEXT: JCC_1 %bb.5, 15, implicit $eflags, debug-location !13
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4.for.body:
+  ; CHECK-NEXT: successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.5.for.body:
+  ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gr32 = PHI [[INC32r]], %bb.4, [[MOV32ri]], %bb.3, debug-instr-number 1, debug-location !13
+  ; CHECK-NEXT: DBG_INSTR_REF !14, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
+  ; CHECK-NEXT: [[IMUL32rr1:%[0-9]+]]:gr32 = nsw IMUL32rr [[PHI2]], [[INC32r]], implicit-def dead $eflags, debug-location !13
+  ; CHECK-NEXT: MOV32mr [[PHI]], 1, $noreg, 0, $noreg, killed [[IMUL32rr1]], debug-location !13 :: (store (s32) into %ir.lsr.iv1)
+  ; CHECK-NEXT: [[ADD64ri8_:%[0-9]+]]:gr64 = ADD64ri8 [[PHI]], 4, implicit-def dead $eflags, debug-location !13
+  ; CHECK-NEXT: [[DEC64r:%[0-9]+]]:gr64 = DEC64r [[PHI1]], implicit-def $eflags, debug-location !13
+  ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit $eflags, debug-location !13
+  ; CHECK-NEXT: JMP_1 %bb.3, debug-location !13
   bb.0.entry:
     successors: %bb.1(0x50000000), %bb.2(0x30000000)
     liveins: $edi, $esi, $edx, $rcx
@@ -177,6 +219,30 @@ body: |
   bb.0.entry:
     liveins: $edi, $esi, $rdx, $rcx
 
+    ; CHECK-LABEL: name: test_cmov_memoperand_in_group_reuse_for_addr2
+    ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    ; CHECK-NEXT: liveins: $edi, $esi, $rdx, $rcx
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rcx
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdx
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $esi
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr32 = COPY $edi
+    ; CHECK-NEXT: [[SUB32rr:%[0-9]+]]:gr32 = SUB32rr [[COPY3]], [[COPY2]], implicit-def $eflags, debug-location !16
+    ; CHECK-NEXT: JCC_1 %bb.2, 7, implicit $eflags, debug-location !16
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: bb.1.entry:
+    ; CHECK-NEXT: successors: %bb.2(0x80000000)
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY]], 1, $noreg, 0, $noreg :: (load (s64) from %ir.y)
+    ; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[MOV64rm]], 1, $noreg, 0, $noreg :: (load (s32) from %ir.p)
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: bb.2.entry:
+    ; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64 = PHI [[MOV64rm]], %bb.1, [[COPY1]], %bb.0, debug-instr-number 1, debug-location !16
+    ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.1, [[COPY3]], %bb.0, debug-instr-number 2, debug-location !16
+    ; CHECK-NEXT: DBG_INSTR_REF !17, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !16
+    ; CHECK-NEXT: DBG_INSTR_REF !18, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !16
+    ; CHECK-NEXT: $eax = COPY [[PHI1]], debug-location !16
+    ; CHECK-NEXT: RET 0, $eax, debug-location !16
     %3:gr64 = COPY $rcx
     %2:gr64 = COPY $rdx
     %1:gr32 = COPY $esi